[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <D655C66D-8C52-4CE3-A00B-697735CFA51D@oracle.com>
Date: Mon, 13 May 2019 07:31:37 -0400
From: Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>
To: Wanpeng Li <kernellwp@...il.com>,
Marcelo Tosatti <mtosatti@...hat.com>
CC: kvm-devel <kvm@...r.kernel.org>,
LKML <linux-kernel@...r.kernel.org>,
Thomas Gleixner <tglx@...utronix.de>,
Ingo Molnar <mingo@...nel.org>,
Andrea Arcangeli <aarcange@...hat.com>,
Bandan Das <bsd@...hat.com>,
Paolo Bonzini <pbonzini@...hat.com>, ankur.a.arora@...cle.com,
Boris Ostrovsky <boris.ostrovsky@...cle.com>
Subject: Re: [PATCH] sched: introduce configurable delay before entering idle
On May 13, 2019 5:20:37 AM EDT, Wanpeng Li <kernellwp@...il.com> wrote:
>On Wed, 8 May 2019 at 02:57, Marcelo Tosatti <mtosatti@...hat.com>
>wrote:
>>
>>
>> Certain workloads perform poorly on KVM compared to baremetal
>> due to baremetal's ability to perform mwait on NEED_RESCHED
>> bit of task flags (therefore skipping the IPI).
>
>KVM supports exposing mwait to the guest; could that solve this?
>
There is a bit of a problem with that. The host will see 100% CPU utilization even if the guest is idle and taking long naps.
Depending on your dashboard, that can look like the machine is on fire.
CCing Ankur and Boris.
>Regards,
>Wanpeng Li
>
>>
>> This patch introduces a configurable busy-wait delay before entering
>> the
>> architecture delay routine, allowing wakeup IPIs to be skipped
>> (if the IPI happens in that window).
>>
>> The real-life workload whose performance this patch improves
>> is SAP HANA (by 5-10%), for which setting idle_spin to 30
>> is sufficient.
>>
>> This patch improves the attached server.py and client.py example
>> as follows:
>>
>> Host: 31.814230202231556
>> Guest: 38.17718765199993 (83 %)
>> Guest, idle_spin=50us: 33.317709898000004 (95 %)
>> Guest, idle_spin=220us: 32.27826551499999 (98 %)
>>
>> Signed-off-by: Marcelo Tosatti <mtosatti@...hat.com>
>>
>> ---
>> kernel/sched/idle.c | 86 ++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 86 insertions(+)
>>
>> diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
>> index f5516bae0c1b..bca7656a7ea0 100644
>> --- a/kernel/sched/idle.c
>> +++ b/kernel/sched/idle.c
>> @@ -216,6 +216,29 @@ static void cpuidle_idle_call(void)
>> rcu_idle_exit();
>> }
>>
>> +static unsigned int spin_before_idle_us;
>>
>> +static void do_spin_before_idle(void)
>> +{
>> + ktime_t now, end_spin;
>> +
>> + now = ktime_get();
>> + end_spin = ktime_add_ns(now, spin_before_idle_us*1000);
>> +
>> + rcu_idle_enter();
>> + local_irq_enable();
>> + stop_critical_timings();
>> +
>> + do {
>> + cpu_relax();
>> + now = ktime_get();
>> + } while (!tif_need_resched() && ktime_before(now, end_spin));
>> +
>> + start_critical_timings();
>> + rcu_idle_exit();
>> + local_irq_disable();
>> +}
>> +
>> /*
>> * Generic idle loop implementation
>> *
>> @@ -259,6 +282,8 @@ static void do_idle(void)
>> tick_nohz_idle_restart_tick();
>> cpu_idle_poll();
>> } else {
>> + if (spin_before_idle_us)
>> + do_spin_before_idle();
>> cpuidle_idle_call();
>> }
>> arch_cpu_idle_exit();
>> @@ -465,3 +490,64 @@ const struct sched_class idle_sched_class = {
>> .switched_to = switched_to_idle,
>> .update_curr = update_curr_idle,
>> };
>> +
>> +
>> +static ssize_t store_idle_spin(struct kobject *kobj,
>> + struct kobj_attribute *attr,
>> + const char *buf, size_t count)
>> +{
>> + unsigned int val;
>> +
>> + if (kstrtouint(buf, 10, &val) < 0)
>> + return -EINVAL;
>> +
>> + if (val > USEC_PER_SEC)
>> + return -EINVAL;
>> +
>> + spin_before_idle_us = val;
>> + return count;
>> +}
>> +
>> +static ssize_t show_idle_spin(struct kobject *kobj,
>> + struct kobj_attribute *attr,
>> + char *buf)
>> +{
>> + ssize_t ret;
>> +
>> + ret = sprintf(buf, "%d\n", spin_before_idle_us);
>> +
>> + return ret;
>> +}
>> +
>> +static struct kobj_attribute idle_spin_attr =
>> + __ATTR(idle_spin, 0644, show_idle_spin, store_idle_spin);
>> +
>> +static struct attribute *sched_attrs[] = {
>> + &idle_spin_attr.attr,
>> + NULL,
>> +};
>> +
>> +static const struct attribute_group sched_attr_group = {
>> + .attrs = sched_attrs,
>> +};
>> +
>> +static struct kobject *sched_kobj;
>> +
>> +static int __init sched_sysfs_init(void)
>> +{
>> + int error;
>> +
>> + sched_kobj = kobject_create_and_add("sched", kernel_kobj);
>> + if (!sched_kobj)
>> + return -ENOMEM;
>> +
>> + error = sysfs_create_group(sched_kobj, &sched_attr_group);
>> + if (error)
>> + goto err;
>> + return 0;
>> +
>> +err:
>> + kobject_put(sched_kobj);
>> + return error;
>> +}
>> +postcore_initcall(sched_sysfs_init);
Powered by blists - more mailing lists