/*
 * This is a demonstration of unwanted SMP scheduler side-effects
 * caused by high-priority threads.
 *
 * In this demo, we have three threads:
 *
 *   1. A busy-loop (CPU bound) at nice level 0.
 *
 *   2. Another busy-loop at nice level 0.
 *
 *   3. A high-priority thread (nice -10) which spends
 *      most of its time sleeping, but it wakes up
 *      frequently and spends a little bit of CPU each time.
 *
 * This is meant to model three of our threads in the vmware-vmx: a
 * VCPU thread, the MKS thread, and the main VMX thread. Ideally, the
 * VCPU and MKS would spend most of their time running on separate
 * CPUs on an SMP system. The VMX thread would wake up frequently,
 * interrupt an arbitrary cpu-bound thread, then go back to sleep.
 *
 * The actual behaviour I see on Linux 2.6 is that the system
 * oscillates between two states, with a period of a few seconds. In
 * the "good" state, the two busy-loop threads run on separate CPUs.
 * In the "bad" state, both of the busy-loop threads run on the same
 * physical CPU and the other CPU sits idle.
 *
 * Taking a closer look at the kernel's scheduler debug output
 * (/proc/sched_debug and /proc/schedstat) the problem becomes
 * clearer: Even though the VMX thread spends very little time
 * running, its runtime is given extra weight according to its
 * priority. The "load" calculated by the scheduler for its CPU is
 * low, but the load average calculated via delta_exec and delta_fair
 * can become quite high. The result is that the physical CPU where
 * the VMX was running gets stuck with a high load average even after
 * the VMX thread is sleeping again. This high load average causes all
 * running tasks to be rebalanced onto the other CPU until the high
 * load subsides.
 *
 * This example requires a machine with exactly 2 CPUs.
 *
 * Usage:
 *   cc -o priosched priosched.c -lpthread
 *   sudo ./priosched
 *
 * Now observe the load on both CPUs. In the "good" state, both CPUs
 * will be busy. In the "bad" state, both of the busyThreads will be
 * stuck on the same CPU and the other CPU will be idle.
 *
 * If you have a kernel with scheduler debugging compiled in, try "cat
 * /proc/sched_debug". In the "bad" state, one CPU will have an empty
 * runnable task list and a list of cpu_load[] averages around 9000.
 * (See the note at the end of this file for where that figure comes
 * from.)
 *
 * -- Micah Dowty
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <fcntl.h>
#include <pthread.h>
#include <sys/ioctl.h>
#include <linux/rtc.h>

/*
 * Knobs.
 * You may have to tweak these to reproduce the problem on your machine.
 */

#define NUM_BUSY_THREADS          2
#define MAINTHREAD_PRIORITY     -15   // Nice level
#define MAINTHREAD_WAKE_HZ      256   // Frequency to wake up at
#define MAINTHREAD_LOAD_PERCENT   5   // Percent of time to wake up and generate load
#define MAINTHREAD_LOAD_CYCLES   10   // Consecutive clock ticks to generate load for

void *busyThreadFunc(void *arg)
{
   // Pure CPU-bound load: never sleeps, never yields.
   while (1);
}

int main()
{
   pthread_t busyThreads[NUM_BUSY_THREADS];
   int i, rtc;

   // Start the nice-0 busy-loop threads.
   for (i = 0; i < NUM_BUSY_THREADS; i++) {
      if (pthread_create(&busyThreads[i], NULL, busyThreadFunc, NULL)) {
         perror("pthread_create");
         return 1;
      }
   }

   // Raise the main thread's priority. A negative nice level needs root.
   if (nice(MAINTHREAD_PRIORITY) == -1) {
      fprintf(stderr, "This program must be run as root.\n");
      return 1;
   }

   // Program the RTC to deliver periodic interrupts at MAINTHREAD_WAKE_HZ.
   rtc = open("/dev/rtc", O_RDONLY);
   if (rtc == -1) {
      perror("/dev/rtc");
      return 1;
   }
   if (ioctl(rtc, RTC_IRQP_SET, MAINTHREAD_WAKE_HZ) ||
       ioctl(rtc, RTC_PIE_ON, 0)) {
      perror("ioctl");
      return 1;
   }

   while (1) {
      unsigned long data;

      // Sleep until the next RTC periodic interrupt.
      if (read(rtc, &data, sizeof data) != sizeof data) {
         perror("read");
         return 1;
      }

      // On a small fraction of wakeups, burn CPU for a few consecutive
      // ticks by spinning on a nonblocking read until each tick arrives.
      if (random() % 100 <= MAINTHREAD_LOAD_PERCENT) {
         for (i = 0; i < MAINTHREAD_LOAD_CYCLES; i++) {
            fcntl(rtc, F_SETFL, O_NONBLOCK);
            while (read(rtc, &data, sizeof data) < 0);
            fcntl(rtc, F_SETFL, 0);
         }
      }
   }

   return 0;
}
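
/*
 * Editorial note (not part of the original demo): the ~9000 cpu_load[]
 * figure quoted in the header is consistent with CFS's per-task load
 * weights. Assuming the kernel's prio_to_weight[] table of that era,
 * a nice 0 task has weight 1024 and each nice level is roughly a 1.25x
 * step, so a nice -10 task weighs 9548 (and nice -15 about 29154). In
 * other words, a briefly-running high-priority thread can leave behind
 * a per-CPU load comparable to several always-runnable nice 0 threads.
 */

/*
 * A minimal sketch (editorial addition, disabled by #if 0) of one way to
 * confirm that the idle-CPU behaviour is a load-balancing artifact: pin
 * each busy thread to its own CPU and check that both CPUs then stay
 * busy. pinBusyThreads() is a hypothetical helper, not in the original
 * program, and it uses glibc's GNU affinity extensions. To try it,
 * define _GNU_SOURCE before the includes above, add #include <sched.h>,
 * call pinBusyThreads(busyThreads, NUM_BUSY_THREADS) after the
 * pthread_create loop, and drop the #if 0.
 */
#if 0
static void
pinBusyThreads(pthread_t *threads, int count)
{
   int i;

   for (i = 0; i < count; i++) {
      cpu_set_t set;

      CPU_ZERO(&set);
      CPU_SET(i, &set);                                    // Thread i -> CPU i
      pthread_setaffinity_np(threads[i], sizeof set, &set);
   }
}
#endif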