/*
 * This is a demonstration of unwanted SMP scheduler side-effects
 * caused by high-priority threads.
 *
 * In this demo, we have three threads:
 *
 *   1. A busy-loop (CPU bound) at nice level 0.
 *
 *   2. Another busy-loop at nice level 0.
 *
 *   3. A high-priority thread (nice -10) which spends
 *      most of its time sleeping, but it wakes up
 *      frequently and spends a little bit of CPU each time.
 *
 * This is meant to model three of our threads in the vmware-vmx: a
 * VCPU thread, the MKS thread, and the main VMX thread. Ideally, the
 * VCPU and MKS would spend most of their time running on separate
 * CPUs on an SMP system. The VMX thread would wake up frequently,
 * interrupt an arbitrary cpu-bound thread, then go back to sleep.
 *
 * The actual behaviour I see on Linux 2.6 is that the system
 * oscillates between two states, with a period of a few seconds. In
 * the "good" state, the two busy-loop threads run on separate CPUs.
 * In the "bad" state, both of the busy-loop threads run on the same
 * physical CPU and the other CPU sits idle.
 *
 * Taking a closer look at the kernel's scheduler debug output
 * (/proc/sched_debug and /proc/schedstat) the problem becomes
 * clearer: Even though the VMX thread spends very little time
 * running, its runtime is given extra weight according to its
 * priority. The "load" calculated by the scheduler for its CPU is
 * low, but the load average calculated via delta_exec and delta_fair
 * can become quite high. The result is that the physical CPU where
 * the VMX was running gets stuck with a high load average even after
 * the VMX thread is sleeping again. This high load average causes all
 * running tasks to be rebalanced onto the other CPU until the high
 * load subsides.
 *
 * This example requires a machine with exactly 2 CPUs.
 *
 * Usage:
 *   cc -o priosched priosched.c -lpthread
 *   sudo ./priosched
 *
 * Now observe the load on both CPUs. In the "good" state, both CPUs
 * will be busy. In the "bad" state, both of the busyThreads will be
 * stuck on the same CPU and the other CPU will be idle.
 *
 * If you have a kernel with scheduler debugging compiled in, try "cat
 * /proc/sched_debug". In the "bad" state, one CPU will have an empty
 * runnable task list and a list of cpu_load[] averages around 9000.
 * (See the note at the end of this file for where that figure comes
 * from.)
 *
 * -- Micah Dowty
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <fcntl.h>
#include <pthread.h>
#include <sys/ioctl.h>
#include <linux/rtc.h>

/*
 * Knobs.
 * You may have to tweak these to reproduce the problem on your machine.
 */

#define NUM_BUSY_THREADS          2
#define MAINTHREAD_PRIORITY     -15   // Nice level
#define MAINTHREAD_WAKE_HZ      256   // Frequency to wake up at
#define MAINTHREAD_LOAD_PERCENT   5   // Percent of time to wake up and generate load
#define MAINTHREAD_LOAD_CYCLES   10   // Consecutive clock ticks to generate load for

void *busyThreadFunc(void *arg)
{
   // Pure CPU-bound load: never sleeps, never yields.
   while (1);
}

int main()
{
   pthread_t busyThreads[NUM_BUSY_THREADS];
   int i, rtc;

   // Start the nice-0 busy-loop threads.
   for (i = 0; i < NUM_BUSY_THREADS; i++) {
      if (pthread_create(&busyThreads[i], NULL, busyThreadFunc, NULL)) {
         perror("pthread_create");
         return 1;
      }
   }

   // Raise the main thread's priority. A negative nice level needs root.
   if (nice(MAINTHREAD_PRIORITY) == -1) {
      fprintf(stderr, "This program must be run as root.\n");
      return 1;
   }

   // Program the RTC to deliver periodic interrupts at MAINTHREAD_WAKE_HZ.
   rtc = open("/dev/rtc", O_RDONLY);
   if (rtc == -1) {
      perror("/dev/rtc");
      return 1;
   }
   if (ioctl(rtc, RTC_IRQP_SET, MAINTHREAD_WAKE_HZ) ||
       ioctl(rtc, RTC_PIE_ON, 0)) {
      perror("ioctl");
      return 1;
   }

   while (1) {
      unsigned long data;

      // Sleep until the next RTC periodic interrupt.
      if (read(rtc, &data, sizeof data) != sizeof data) {
         perror("read");
         return 1;
      }

      // On a small fraction of wakeups, burn CPU for a few consecutive
      // ticks by spinning on a nonblocking read until each tick arrives.
      if (random() % 100 <= MAINTHREAD_LOAD_PERCENT) {
         for (i = 0; i < MAINTHREAD_LOAD_CYCLES; i++) {
            fcntl(rtc, F_SETFL, O_NONBLOCK);
            while (read(rtc, &data, sizeof data) < 0);
            fcntl(rtc, F_SETFL, 0);
         }
      }
   }

   return 0;
}
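
/*
 * Editorial note (not part of the original demo): the ~9000 cpu_load[]
 * figure quoted in the header is consistent with CFS's per-task load
 * weights. Assuming the kernel's prio_to_weight[] table of that era,
 * a nice 0 task has weight 1024 and each nice level is roughly a 1.25x
 * step, so a nice -10 task weighs 9548 (and nice -15 about 29154). In
 * other words, a briefly-running high-priority thread can leave behind
 * a per-CPU load comparable to several always-runnable nice 0 threads.
 */

/*
 * A minimal sketch (editorial addition, disabled by #if 0) of one way to
 * confirm that the idle-CPU behaviour is a load-balancing artifact: pin
 * each busy thread to its own CPU and check that both CPUs then stay
 * busy. pinBusyThreads() is a hypothetical helper, not in the original
 * program, and it uses glibc's GNU affinity extensions. To try it,
 * define _GNU_SOURCE before the includes above, add #include <sched.h>,
 * call pinBusyThreads(busyThreads, NUM_BUSY_THREADS) after the
 * pthread_create loop, and drop the #if 0.
 */
#if 0
static void
pinBusyThreads(pthread_t *threads, int count)
{
   int i;

   for (i = 0; i < count; i++) {
      cpu_set_t set;

      CPU_ZERO(&set);
      CPU_SET(i, &set);                                    // Thread i -> CPU i
      pthread_setaffinity_np(threads[i], sizeof set, &set);
   }
}
#endif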