/*
 * Low-level routines for memory mirroring.  Tracks dirty pages and
 * utilizes provided copy routine to transfer pages which have changed
 * between harvest passes.
 *
 * Copyright (C) 2006 Stratus Technologies Bermuda Ltd.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include "fosil.h"
#include "syncd.h"
#include "harvest.h"
#include "smallpmd.h"

/*
 * The following ifdef allows harvest to be built against a kernel
 * which does not implement memory tracking.
 */
#ifdef CONFIG_TRACK_DIRTY_PAGES

/* Kernel headers for the interfaces used below (assumed set). */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>

#define HARVEST_BY_TABLE 1
#define SMP_HARVEST 1
//#define USE_SMALL_PMD 1
#define USE_TASK_HARVEST 1

#define MAX_COPYRANGE 100	/* this is actually defined in cc/host.h */
#define MB (1024*1024)
#define NEARBY 6

#ifdef CONFIG_X86_PAE
typedef unsigned long long paddr_t;	/* physical address type */
#define PADDR_FMT "%09lx"		/* printf format for paddr_t */
#else
typedef unsigned long paddr_t;		/* physical address type */
#define PADDR_FMT "%08lx"		/* printf format for paddr_t */
#endif

enum {
	DBGLVL_NONE     = 0x00,
	DBGLVL_HARVEST  = 0x01,
	DBGLVL_BROWNOUT = 0x02,
	DBGLVL_TIMER    = 0x04,
};
static int dbgmask = DBGLVL_NONE;

#define DBGPRNT(lvl, args...)		\
({					\
	if (lvl & dbgmask)		\
		printk(args);		\
})
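/*
 * Example (illustrative): building with dbgmask = DBGLVL_HARVEST | DBGLVL_BROWNOUT
 * makes DBGPRNT(DBGLVL_BROWNOUT, "...") calls print, while
 * DBGPRNT(DBGLVL_TIMER, "...") calls stay silent.
 */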
enum {
	IN_BROWNOUT = 0,
	IN_BLACKOUT = 1,
};

struct timer_set {
	unsigned long long before;
	unsigned long long after;
};

struct syncd_data {
	unsigned long flags[NR_CPUS];
	copy_proc copy;
	sync_proc sync;
	void * context;
	is_pfn_valid_proc is_pfn_valid;
	is_pfnrange_valid_proc is_pfnrange_valid;
	int status;			// return status from command inside blackout
	int max_range;			// # of valid ranges in following CopyRange
	fosil_page_range_t CopyRange[MAX_COPYRANGE];
	unsigned long nearby;		// insert merge tunable
	unsigned int duration_in_ms;	// blackout duration
	unsigned long bitcnt;		// number of bits found in last harvest
	int blackout;			// 1 == blackout, 0 == brownout
	int pass;			// # of times through harvest/process cycle
	int done;			// 1 == done - time to leave
	unsigned int threshold;		// number of dirty pages before sync
	atomic_t r;
};
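/*
 * Note on 'threshold' (illustrative): brownout_cycle() below enters the
 * final blackout sync once a harvest pass finds fewer dirty pages than
 * 'threshold', or when the last allowed pass is reached.  For example,
 * with threshold == 1000, a pass that harvests only 800 dirty pages
 * triggers the blackout on the spot.
 */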
#ifdef HARVEST_BY_TABLE
void harvest_table(pgd_t * pgd_base, unsigned long start, unsigned long end)
{
	unsigned long i, j, k, l;
	pgd_t *pgd;
	unsigned long pgd_start, pgd_end;
	unsigned long pud_start, pud_end;
	unsigned long pmd_start, pmd_end;
	unsigned long pte_start, pte_end;
	unsigned long pud_last;
	unsigned long pmd_last;
	unsigned long pte_last;

	/*
	 * end marks the first byte after the upper limit of the address
	 * range
	 */
	end--;

	if (!pgd_base) {
		printk("%s: null pgd?\n", __func__);
		return;
	}

	pgd_start = pgd_index(start);
	pgd_end   = pgd_index(end);
	pud_start = pud_index(start);
	pud_end   = pud_index(end);
	pmd_start = pmd_index(start);
	pmd_end   = pmd_index(end);
	pte_start = pte_index(start);
	pte_end   = pte_index(end);

	pgd = pgd_base + pgd_start;
	for (i = pgd_start; i <= pgd_end; i++, pgd++) {
		pud_t *pud;

		if (pgd_none(*pgd) || !pgd_present(*pgd))
			continue;

		if (i == pgd_start)
			j = pud_start;
		else
			j = 0;
		pud = pud_offset(pgd, 0) + j;

		if (i == pgd_end)
			pud_last = pud_end;
		else
			pud_last = PTRS_PER_PUD - 1;

		for (; j <= pud_last; j++, pud++) {
			pmd_t *pmd;
			unsigned long addr;
			pud_t tmp_pud = *pud;

			if (pud_none(tmp_pud) || !pud_present(tmp_pud))
				continue;

			if ((i == pgd_start) && (j == pud_start))
				k = pmd_start;
			else
				k = 0;
			pmd = pmd_offset(pud, 0) + k;

			if ((i == pgd_end) && (j == pud_end))
				pmd_last = pmd_end;
			else
				pmd_last = PTRS_PER_PMD - 1;

			for (; k <= pmd_last; k++, pmd++) {
				pte_t *pte;
				pmd_t tmp_pmd = *pmd;

				addr = (i << PGDIR_SHIFT) +
				       (j << PUD_SHIFT) +
				       (k << PMD_SHIFT);

				if (pmd_none(tmp_pmd) || !pmd_present(tmp_pmd))
					continue;

				if (pmd_large(tmp_pmd)) {
					if (pmd_val(tmp_pmd) & _PAGE_DIRTY) {
						pmd_val(tmp_pmd) &= ~_PAGE_DIRTY;
						pmd_val(tmp_pmd) |= _PAGE_SOFTDIRTY;
						mm_track_pmd(pmd);
					}
				}

				if (pmd_val(tmp_pmd) != pmd_val(*pmd)) {
					*pmd = tmp_pmd;
					if (pmd_val(tmp_pmd) & _PAGE_GLOBAL)
						__flush_tlb_global();
					else
						__flush_tlb_one(addr);
					mm_track_phys((void*)virt_to_phys(pmd));
				}

				if (pmd_large(tmp_pmd))
					continue;

				if ((i == pgd_start) && (j == pud_start) &&
				    (k == pmd_start))
					l = pte_start;
				else
					l = 0;
				pte = pte_offset_kernel(pmd, 0) + l;

				if ((i == pgd_end) && (j == pud_end) &&
				    (k == pmd_end))
					pte_last = pte_end;
				else
					pte_last = PTRS_PER_PTE - 1;

				for (; l <= pte_last; l++, pte++) {
					unsigned long addr_pte = addr + (l << PAGE_SHIFT);
					pte_t tmp_pte = *pte;

					if (pte_val(tmp_pte) & _PAGE_DIRTY) {
						pte_val(tmp_pte) &= ~_PAGE_DIRTY;
						pte_val(tmp_pte) |= _PAGE_SOFTDIRTY;
						mm_track_pte(pte);
					}

					if (pte_val(tmp_pte) != pte_val(*pte)) {
						*pte = tmp_pte;
						if (pte_val(tmp_pte) & _PAGE_GLOBAL)
							__flush_tlb_global();
						else
							__flush_tlb_one(addr_pte);
						mm_track_phys((void*)virt_to_phys(pte));
					}
				}
			}
		}
	}
}
#else
static void harvest_page(pgd_t * pgd_base, unsigned long addr)
{
	// for all pages in this vm_area_struct
	pgd_t *pgd;
	pmd_t *pmd;
	pte_t *pte;

	DBGPRNT(DBGLVL_HARVEST, "pgd_base %p + offset %ld\n",
		pgd_base, pgd_index(addr));
	DBGPRNT(DBGLVL_HARVEST, " pgd[0] %09lx\n", pgd_val(pgd_base[0]));
	DBGPRNT(DBGLVL_HARVEST, " pgd[1] %09lx\n", pgd_val(pgd_base[1]));
	DBGPRNT(DBGLVL_HARVEST, " pgd[2] %09lx\n", pgd_val(pgd_base[2]));
	DBGPRNT(DBGLVL_HARVEST, " pgd[3] %09lx\n", pgd_val(pgd_base[3]));

	pgd = pgd_base + pgd_index(addr);
	if (pgd_none(*pgd))
		return;
	if (!pgd_present(*pgd))
		return;

	pmd = pmd_offset(pgd, addr);
	DBGPRNT(DBGLVL_HARVEST, " pmd[%ld] %p is %lx\n",
		pmd_index(addr), pmd, pmd_val(*pmd));
	if (pmd_none(*pmd))
		return;
	if (!pmd_present(*pmd))
		return;

	// large pages
	if (pmd_large(*pmd)) {
		if ((pmd_val(*pmd) & _PAGE_DIRTY) == 0)
			return;
		mm_track_pmd(pmd);
		pmd_val(*pmd) &= ~_PAGE_DIRTY;
		pmd_val(*pmd) |= _PAGE_SOFTDIRTY;
		return;
	}

	if (addr < PAGE_OFFSET)
		pte = pte_offset_map(pmd, addr);
	else
		pte = pte_offset_kernel(pmd, addr);

	DBGPRNT(DBGLVL_HARVEST, " page table %lx\n",
		pmd_val(*pmd) & PAGE_MASK);
	DBGPRNT(DBGLVL_HARVEST, " pte[%ld] %p is %lx\n",
		pte_index(addr), pte, pte_val(*pte));

	if (pte_none(*pte) || !pte_present(*pte))
		goto out_unmap_pte;
	if (!(pte_val(*pte) & _PAGE_DIRTY))
		goto out_unmap_pte;

	mm_track_pte(pte);
	pte_val(*pte) &= ~_PAGE_DIRTY;
	pte_val(*pte) |= _PAGE_SOFTDIRTY;

	DBGPRNT(DBGLVL_HARVEST, "a pte[%ld] %p is %lx\n",
		pte_index(addr), pte, pte_val(*pte));

out_unmap_pte:
	if (addr < PAGE_OFFSET)
		pte_unmap(pte);
	return;
}
#endif

static pgd_t *get_pgd(unsigned long address)
{
#if 0
	pgd_t *pgd;

	asm volatile("movq %%cr3,%0" : "=r" (pgd));
	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (!pgd_present(*pgd)) {
		printk("%s-KAM: pgd not present (%p)\n",
		       __func__, (void*)(pgd_val(*pgd)));
		return NULL;
	}
	if (pgd != init_level4_pgt)
		printk("%s: got %p, but init_level4_pgt is %p\n",
		       __func__, pgd, init_level4_pgt);
	return pgd;
#else
	return init_level4_pgt;
#endif
}
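/*
 * Illustrative summary of the harvest step implemented above (no extra
 * functionality): for every present mapping, a hardware-set _PAGE_DIRTY
 * bit is cleared, _PAGE_SOFTDIRTY is set in its place, the page is
 * reported to the tracking bitmap via mm_track_pte()/mm_track_pmd(),
 * and the TLB entry is flushed so the next write sets _PAGE_DIRTY again
 * and the page shows up in a later harvest pass.
 */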
/*
 * Traverse the kernel page tables, but don't use the
 * loop-over-all-kernel-address technique, as that
 * takes one fourth of forever on a 64-bit machine.
 * Instead, iterate over the page tables themselves,
 * starting with the pgd table.  This will be a quadruple-
 * nested loop, relying on p[ml4/gd/md/te]_none checks
 * for early-outs in the loops.
 * Be nice to find a useful way to SMP-ify this traversal,
 * but so far haven't found a way.
 */
static void harvest_kernel(void)
{
	unsigned long n, i, j, k;
	pgd_t *pgd_0 = get_pgd(0);
	pgd_t *pgd = get_pgd(PAGE_OFFSET);

	if (!pgd_0) {
		printk("%s: null pgd_0\n", __func__);
		return;
	}
	if (!pgd) {
		printk("%s: null pgd\n", __func__);
		return;
	}

	// Skip addrs < PAGE_OFFSET
	for (n = pgd - pgd_0; n < PTRS_PER_PGD; n++, pgd++) {
		pud_t *pud;

		if (pgd_none(*pgd) || !pgd_present(*pgd))
			continue;

		pud = pud_offset(pgd, 0);
		if (!pud) {
			printk("%s: null pud?\n", __func__);
			return;
		}
		mm_track_phys((void*)virt_to_phys(pud));

		for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
			pmd_t *pmd;

			if (pud_none(*pud) || !pud_present(*pud))
				continue;

			pmd = pmd_offset(pud, 0);
			for (j = 0; j < PTRS_PER_PMD; j++, pmd++) {
				pte_t *pte;
				pmd_t tmp_pmd = *pmd;
				unsigned long addr;

				addr = n << PGDIR_SHIFT;
				addr += i << PUD_SHIFT;
				addr += j << PMD_SHIFT;

				if (pmd_none(tmp_pmd) || !pmd_present(tmp_pmd))
					continue;

				if (pmd_large(tmp_pmd)) {
					if (pmd_val(tmp_pmd) & _PAGE_DIRTY) {
						pmd_val(tmp_pmd) &= ~_PAGE_DIRTY;
						pmd_val(tmp_pmd) |= _PAGE_SOFTDIRTY;
						mm_track_pmd(pmd);
					}
				}

				if (pmd_val(tmp_pmd) != pmd_val(*pmd)) {
					*pmd = tmp_pmd;
					if (pmd_val(tmp_pmd) & _PAGE_GLOBAL)
						__flush_tlb_global();
					else
						__flush_tlb_one(addr);
					mm_track_phys((void*)virt_to_phys(pmd));
				}

				if (pmd_large(tmp_pmd))
					continue;

				pte = pte_offset_kernel(pmd, 0);
				for (k = 0; k < PTRS_PER_PTE; k++, pte++) {
					unsigned long addr_pte = addr + (k << PAGE_SHIFT);
					pte_t tmp_pte = *pte;

					if (pte_none(tmp_pte) || !pte_present(tmp_pte))
						continue;

					if (pte_val(tmp_pte) & _PAGE_DIRTY) {
						pte_val(tmp_pte) &= ~_PAGE_DIRTY;
						pte_val(tmp_pte) |= _PAGE_SOFTDIRTY;
						mm_track_pte(pte);
					}

					if (pte_val(tmp_pte) != pte_val(*pte)) {
						*pte = tmp_pte;
						if (pte_val(tmp_pte) & _PAGE_GLOBAL)
							__flush_tlb_global();
						else
							__flush_tlb_one(addr_pte);
						mm_track_phys((void*)virt_to_phys(pte));
					}
				}
			}
		}
	}
}

static unsigned long map_kernel_small_pmd(void * __attribute__ ((unused)) unused_arg)
{
#ifdef USE_SMALL_PMD
	small_pmd_enter();
#endif
	return 0;
}

static unsigned long map_kernel_orig_pmd(void * __attribute__ ((unused)) unused_arg)
{
#ifdef USE_SMALL_PMD
	small_pmd_leave();
#endif
	return 0;
}
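/*
 * Presumed intent of the small-pmd swap (USE_SMALL_PMD): while harvesting,
 * temporarily map the kernel with 4KB pages instead of 2MB large pmds so
 * that dirty bits resolve to single pages rather than whole 2MB regions,
 * keeping the brownout copy sets small.  The two wrappers above are run
 * on all processors via fosil_exec_onall().
 */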
#ifdef USE_TASK_HARVEST
static void harvest_mm(struct mm_struct *mm)
{
	struct vm_area_struct * mmap, * vma;
#ifndef HARVEST_BY_TABLE
	unsigned long addr;
#endif

	vma = mmap = mm->mmap;
	if (!mmap)
		return;

	do {
		unsigned long start, end;

		if (!vma)
			break;

		start = vma->vm_start;
		end = vma->vm_end;

#ifdef HARVEST_BY_TABLE
		harvest_table(mm->pgd, start, end);
#else
		for (addr = start; addr <= end; addr += PAGE_SIZE)
			harvest_page(mm->pgd, addr);
#endif
	} while ((vma = vma->vm_next) != mmap);
}

static void harvest_user(void)
{
	struct task_struct *p;
	int task_count = 0;
	int this_cpu = smp_processor_id();
	int num_cpus = num_online_cpus();

	for_each_process(p) {
		struct mm_struct *mm = p->mm;

#ifdef SMP_HARVEST
		if (this_cpu != (task_count++ % num_cpus))
			continue;
#endif
		if (!mm)
			continue;

		harvest_mm(mm);
	}
}
#else
static void harvest_user(void)
{
	struct list_head * m = NULL;
	struct mm_struct * mmentry;
	struct vm_area_struct * mmap, * vma;
#ifndef HARVEST_BY_TABLE
	unsigned long addr;
#endif
	int num_cpus = num_online_cpus();
	int this_cpu = smp_processor_id();
	int pgd_count = 0;

	m = &init_mm.mmlist;
	do {
		mmentry = list_entry(m, struct mm_struct, mmlist);
		m = m->next;

		vma = mmap = mmentry->mmap;
		if (!mmap)
			continue;

#ifdef SMP_HARVEST
		if (this_cpu != (pgd_count++ % num_cpus))
			continue;
#endif
		do {
			unsigned long start, end;

			if (!vma)
				break;

			start = vma->vm_start;
			end = vma->vm_end;

#ifdef HARVEST_BY_TABLE
			harvest_table(mmentry->pgd, start, end);
#else
			for (addr = start; addr <= end; addr += PAGE_SIZE)
				harvest_page(mmentry->pgd, addr);
#endif
		} while ((vma = vma->vm_next) != mmap);
	} while (m != &init_mm.mmlist);
}
#endif

#define VIRT_TO_PFN(pfn) (virt_to_phys(pfn) >> PAGE_SHIFT)

#define TEST_BIT(x, pfn, vector)				\
({								\
	int retval;						\
								\
	if (x->blackout)					\
		retval = test_bit(pfn, vector);			\
	else							\
		retval = test_and_clear_bit(pfn, vector);	\
								\
	retval;							\
})

static void process_vector(void * arg)
{
	struct syncd_data * x = (struct syncd_data *) arg;
	unsigned long pfn;
	unsigned long start_pfn, runlength;

	// traverse the bitmap looking for dirty pages
	runlength = 0;
	start_pfn = 0;
	for (pfn = 0; pfn < mm_tracking_struct.bitcnt; pfn++) {
		if (TEST_BIT(x, pfn, mm_tracking_struct.vector)) {
			if (!(x->blackout))
				atomic_dec(&mm_tracking_struct.count);
			if (x->is_pfn_valid(pfn)) {
				if (runlength == 0) {
					start_pfn = pfn;
				}
				runlength++;
			} else {
				if (runlength) {
					x->status = x->copy(start_pfn, runlength,
							    x->blackout, x->context);
					if (x->status != FOSIL_ERR_OK) {
						printk("%s: %d error (%d), start_pfn %lx, runlength %lx\n",
						       __FUNCTION__, __LINE__, x->status,
						       start_pfn, runlength);
						return;
					}
				}
				start_pfn = runlength = 0;
			}
		} else if (runlength) {
			x->status = x->copy(start_pfn, runlength, x->blackout, x->context);
			if (x->status != FOSIL_ERR_OK) {
				printk("%s: %d error (%d), start_pfn %lx, runlength %lx\n",
				       __FUNCTION__, __LINE__, x->status,
				       start_pfn, runlength);
				return;
			}
			start_pfn = runlength = 0;
		}
	}

	// Catch the last ones.
	if (runlength) {
		x->status = x->copy(start_pfn, runlength, x->blackout, x->context);
		if (x->status != FOSIL_ERR_OK) {
			printk("%s: %d\n", __FUNCTION__, __LINE__);
			return;
		}
	}
}

static void enable_tracking(void)
{
	// initialize tracking structure
	atomic_set(&mm_tracking_struct.count, 0);
	memset(mm_tracking_struct.vector, 0, mm_tracking_struct.bitcnt/8);

	// turn on tracking
	mm_tracking_struct.active = 1;
}

static void disable_tracking(void)
{
	// turn off tracking
	mm_tracking_struct.active = 0;
}

/*
 * Difference or zero
 *
 *	d <- (x - y), x >= y
 *	     0      , x <  y
 */
unsigned long doz(unsigned long x, unsigned long y)
{
	if (x < y)
		return 0;
	return x - y;
}
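/*
 * Worked example: doz(10, 6) == 4, doz(4, 6) == 0.  insert_pfn() below
 * uses doz(start, nearby) so that "start - nearby" clamps at pfn 0
 * instead of wrapping around the unsigned range.
 */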
/*
 * Merge closest two regions to free up one slot in table.
 */
int merge(struct syncd_data * x)
{
	int i, ii;
	unsigned long delta, ndelta;
	unsigned long end;
	fosil_page_range_t * cr = x->CopyRange;

	ii = ~0;
	delta = ~0UL;
	for (i = 0; i < (x->max_range - 1); i++) {
		end = cr[i].start + cr[i].num - 1;
		ndelta = cr[i+1].start - end;

		if (!x->is_pfnrange_valid(end, cr[i+1].start))
			continue;

		if (ndelta <= delta) {
			delta = ndelta;
			ii = i;
		}
	}

	if (ii == ~0)
		return 1;

	cr[ii].num = (cr[ii+1].start + cr[ii+1].num) - cr[ii].start;

	// shift everyone down
	x->max_range--;
	for (i = ii + 1; i < x->max_range; i++) {
		cr[i].start = cr[i+1].start;
		cr[i].num = cr[i+1].num;
	}

	return 0;
}

int insert_pfn(struct syncd_data * x, unsigned long pfn)
{
	int i;
	int ii;
	fosil_page_range_t * cr = x->CopyRange;
	int nearby = x->nearby;

	if (!x->is_pfn_valid(pfn))
		return 0;

restart:
	for (i = 0; i < x->max_range; i++) {
		unsigned long start = cr[i].start;
		unsigned long len = cr[i].num;
		unsigned long end = start + len - 1;

		// inside current region	start <= pfn <= end
		if ((pfn - start) <= (len - 1))
			return 0;

		// adjacent to start		start-nearby <= pfn <= start
		if ((pfn - doz(start, nearby)) <= start - doz(start, nearby)) {
			cr[i].num += (cr[i].start - pfn);
			cr[i].start = pfn;
			return 0;
		}

		// adjacent to end		end <= pfn <= end+nearby
		if ((pfn - end) <= nearby) {
			cr[i].num += pfn - end;
			end += pfn - end;
again:
			if (i+1 == x->max_range)
				return 0;

			// this range may overlap or abut next range
			// after insert		(next.start - nearby) <= end
			if (doz(cr[i+1].start, nearby) <= end) {
				cr[i].num = (cr[i+1].start + cr[i+1].num) - cr[i].start;
				end = cr[i].start + cr[i].num - 1;

				// shift everyone down
				x->max_range--;
				for (ii = i + 1; ii < x->max_range; ii++) {
					cr[ii].start = cr[ii+1].start;
					cr[ii].num = cr[ii+1].num;
				}
				goto again;
			}
			return 0;
		}

		// before region	pfn < start-nearby			- OR -
		// !at end of array						- AND -
		// between regions	end+nearby < pfn < next.start-nearby
		if ((pfn < doz(start, nearby)) ||
		    ((i+1 != x->max_range) &&
		     ((end+nearby < pfn) && (pfn < doz(cr[i+1].start, nearby))))) {
			if (x->max_range == MAX_COPYRANGE) {
				if (merge(x))
					return 1;
				goto restart;
			}

			// shift everyone up to make room
			for (ii = x->max_range; ii > i; ii--) {
				cr[ii].start = cr[ii-1].start;
				cr[ii].num = cr[ii-1].num;
			}

			if (pfn < doz(start, nearby))
				ii = i;		// before region
			else
				ii = i + 1;	// between regions

			cr[ii].start = pfn;
			cr[ii].num = 1;
			x->max_range++;
			return 0;
		}
	}

	if (x->max_range == MAX_COPYRANGE)
		if (merge(x))
			return 1;

	// append to array
	cr[x->max_range].start = pfn;
	cr[x->max_range].num = 1;
	x->max_range++;

	return 0;
}
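/*
 * Illustrative sketch (not called anywhere): how the CopyRange table
 * evolves as pfns are inserted with the default NEARBY gap of 6.  The
 * pfn values are made up and assume x->is_pfn_valid() accepts them.
 */
#if 0
static void copyrange_example(struct syncd_data * x)
{
	x->max_range = 0;
	x->nearby = NEARBY;

	insert_pfn(x, 0x100);	/* CopyRange[0] = {0x100, 1} */
	insert_pfn(x, 0x103);	/* within NEARBY of the end: CopyRange[0] = {0x100, 4} */
	insert_pfn(x, 0x200);	/* too far away, new range:  CopyRange[1] = {0x200, 1} */
	insert_pfn(x, 0x1fe);	/* within NEARBY of 0x200:   CopyRange[1] = {0x1fe, 3} */
}
#endif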
/*
 * x86_64 stacks are different from i386 in that you don't get
 * there by adding 2 pages to "current".  Instead, we'll put
 * a dummy variable on the stack and look at its address.
 */
static int add_stack_pages(struct syncd_data * x)
{
	unsigned long dummy;
	unsigned long *stack = &dummy;
	unsigned long pfn, num_pfns;
	int rc = 0;
	unsigned long i;

	pfn = (unsigned long)(virt_to_phys(stack) & ~(THREAD_SIZE - 1));
	pfn >>= PAGE_SHIFT;
	num_pfns = (THREAD_SIZE + (PAGE_SIZE - 1)) >> PAGE_SHIFT;

	for (i = 0; i < num_pfns; i++, pfn++) {
		rc = insert_pfn(x, pfn);
		if (rc)
			return rc;
	}
	return 0;
}

static unsigned long virt_to_pfn(void *addr)
{
	unsigned long phys = virt_to_phys(addr);
	return phys >> PAGE_SHIFT;
}

/*
 * ftmod dirties a page for OS_DEBUG prints (ringbuffer).
 * Provide that page here, so we can track it, as it will be
 * dirtied on the way to the sync point.
 */
void *fosil_scratch_page;
EXPORT_SYMBOL_GPL(fosil_scratch_page);
static spinlock_t *fosil_scratch_lock;
EXPORT_SYMBOL_GPL(fosil_scratch_lock);

/*
 * With a fundamental knowledge of what absolutely MUST be copied
 * by the HOST during blackout, add pfn's to the copy range list.
 *
 * All page tables are added to avoid having to deal with optimizing
 * which ptes might be set dirty or accessed by the last few operations
 * before blackout.
 *
 * After page tables, the remainder of the pages are our known "working
 * set".  This includes the stack frames for this task and processor and
 * the syncd structure (which we are making dirty by compiling this list).
 *
 * Finally, all pages marked as dirty in their pte's are added to the
 * list.  These are pages which escaped the harvest/process cycles and
 * were presumably dirtied between harvest and here.
 */
static int setup_blackout_copylist(struct syncd_data * x)
{
	pgd_t *pgd;
	int i, j, k;
	int n;

	x->max_range = 0;

	pgd = get_pgd(0);
	if (!pgd) {
		printk("%s: no pgd?\n", __func__);
		return FOSIL_ERR_MERGE_FAIL;
	}

	if (insert_pfn(x, virt_to_pfn(pgd)))
		return FOSIL_ERR_MERGE_FAIL;

	for (n = 0; n < PTRS_PER_PGD; n++, pgd++) {
		pud_t *pud;

		if (pgd_none(*pgd) || !pgd_present(*pgd))
			continue;

		pud = pud_offset(pgd, 0);
		if (insert_pfn(x, virt_to_pfn(pud)))
			return FOSIL_ERR_MERGE_FAIL;

		for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
			pmd_t *pmd;

			if (pud_none(*pud) || !pud_present(*pud))
				continue;

			pmd = pmd_offset(pud, 0);
			if (insert_pfn(x, virt_to_pfn(pmd)))
				return FOSIL_ERR_MERGE_FAIL;

			for (j = 0; j < PTRS_PER_PMD; j++, pmd++) {
				pte_t *pte;

				if (pmd_none(*pmd) || !pmd_present(*pmd))
					continue;

				if (pmd_large(*pmd)) {
					/* Skip it if it covers an invalid area */
					if (!x->is_pfn_valid(pmd_pfn(*pmd)))
						continue;
					if ((pmd_val(*pmd) & _PAGE_DIRTY)) {
						unsigned long p, pfn = pmd_pfn(*pmd);

						for (p = 0; p < 0x200; p++) {
							if (insert_pfn(x, pfn + p))
								return FOSIL_ERR_MERGE_FAIL;
						}
					}
					continue;
				}

				pte = pte_offset_kernel(pmd, 0);
				if (insert_pfn(x, virt_to_pfn(pte)))
					return FOSIL_ERR_MERGE_FAIL;

				for (k = 0; k < PTRS_PER_PTE; k++, pte++) {
					if (pte_none(*pte) || !pte_present(*pte))
						continue;

					/* Skip it if it covers an invalid area */
					if (!x->is_pfn_valid(pte_pfn(*pte)))
						continue;

					if (pte_val(*pte) & _PAGE_DIRTY) {
						if (insert_pfn(x, pte_pfn(*pte)))
							return FOSIL_ERR_MERGE_FAIL;
					}
				}
			}
		}
	}

	// copy the stack we are working from
	if (add_stack_pages(x))
		return FOSIL_ERR_MERGE_FAIL;

	// add syncd structure
	if (insert_pfn(x, virt_to_pfn(x)))
		return FOSIL_ERR_MERGE_FAIL;

	// add mm_tracking_struct (disable_tracking() dirtied it).
	if (insert_pfn(x, virt_to_pfn(&mm_tracking_struct)))
		return FOSIL_ERR_MERGE_FAIL;

	// add the scratch page, in case ftmod wants to dirty it on the way to sync.
	if (insert_pfn(x, virt_to_pfn(fosil_scratch_page)))
		return FOSIL_ERR_MERGE_FAIL;
	if (insert_pfn(x, virt_to_pfn(fosil_scratch_lock)))
		return FOSIL_ERR_MERGE_FAIL;

	return FOSIL_ERR_OK;
}

static void __attribute__ ((__unused__)) do_blackout(void * arg)
{
	struct syncd_data * x = (struct syncd_data *) arg;
	fosil_page_range_t ranges[1] = {{0, ~0}};

	x->status = x->sync(1, ranges, &x->duration_in_ms, x->context);
}
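/*
 * Note: the single {0, ~0} range above describes every pfn, so do_blackout()
 * appears to be an unused brute-force path that asks the sync routine to
 * copy all of memory during blackout instead of the pared-down list built
 * by setup_blackout_copylist().
 */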
static unsigned long harvest(void * arg)
{
#ifdef SMP_HARVEST
	struct syncd_data * x = (struct syncd_data *) arg;
#else
	if (smp_processor_id() != 0)
		return 0;
#endif

	harvest_user();

#ifdef SMP_HARVEST
	rendezvous(&x->r);
	if (smp_processor_id() == 0)
		harvest_kernel();
#else
	harvest_kernel();
#endif
	return 0;
}

/*
 * This is where all processors are sent for blackout processing.
 * The driving processor is diverted to work on harvest and if necessary
 * entry to Common Code for synchronization.  All other processors are
 * spinning in a rendezvous at the bottom of this routine waiting for
 * the driving processor to finish.
 */
static unsigned long brownout_cycle(void * arg)
{
	struct syncd_data * x = (struct syncd_data *) arg;
	unsigned long long before, after;
	unsigned long delta;

#ifndef SMP_HARVEST
	if (smp_processor_id() != 0)
		return 0;
#endif

	rdtscll(before);

	harvest_user();

#ifdef SMP_HARVEST
	rendezvous(&x->r);
	if (smp_processor_id() != 0)
		return 0;
#endif

	harvest_kernel();

	x->bitcnt = atomic_read(&mm_tracking_struct.count);
	if ((x->bitcnt < x->threshold) || (x->pass == 0)) {
		x->blackout = IN_BLACKOUT;
		disable_tracking();

		x->status = setup_blackout_copylist(x);
		if (x->status)
			return 0;

		process_vector(x);
		if (x->status)
			return FOSIL_ERR_COPY_FAIL;

		rdtscll(after);

		x->status = x->sync(x->max_range, x->CopyRange,
				    &x->duration_in_ms, x->context);
		x->done = 1;
		if (x->status) {
			printk("%s: %d (%s)\n", __FUNCTION__, __LINE__,
			       FOSIL_ERR_STRING(x->status));
		}

		delta = after - before;
		return x->duration_in_ms + (delta / cpu_khz);
	}

	return 0;
}

static void roll_time_forward(unsigned long adjust_in_ms)
{
#if 0 // FIXME
	struct timeval tv;
	struct timespec ts;

	do_gettimeofday(&tv);

	ts.tv_sec = tv.tv_sec;
	ts.tv_nsec = tv.tv_usec + adjust_in_ms * USEC_PER_SEC;
	while (unlikely(ts.tv_nsec >= NSEC_PER_SEC)) {
		ts.tv_nsec -= NSEC_PER_SEC;
		++ts.tv_sec;
	}
	do_settimeofday(&ts);
#endif
}

struct blackout_data {
	atomic_t r[5];
	unsigned int oncpu;
	unsigned long (*func)(void *);
	void * arg;
};

static void fosil_blackout(void * arg)
{
	struct blackout_data * x = (struct blackout_data *) arg;
	unsigned long flags;
	unsigned long long before, after;
	unsigned long delta, adjust;
	int this_cpu = smp_processor_id();

	rendezvous(&x->r[0]);
	local_bh_disable();
	rendezvous(&x->r[1]);
	local_irq_save(flags);
	rendezvous(&x->r[2]);

	rdtscll(before);
	adjust = x->func(x->arg);
	rdtscll(after);

	if (adjust) {
		delta = adjust;
	} else {
		delta = after - before;
		delta /= cpu_khz;
	}

	rendezvous(&x->r[3]);
	if (this_cpu == x->oncpu)
		roll_time_forward(delta);
	rendezvous(&x->r[4]);

	local_irq_restore(flags);
	local_bh_enable();
}

/*
 * Call the specified function with the provided argument
 * during a blackout.
 */
void fosil_exec_onall(unsigned long (*func)(void *), void * arg)
{
	struct blackout_data x;

	init_rendezvous(&x.r[0]);
	init_rendezvous(&x.r[1]);
	init_rendezvous(&x.r[2]);
	init_rendezvous(&x.r[3]);
	init_rendezvous(&x.r[4]);
	x.oncpu = smp_processor_id();
	x.func = func;
	x.arg = arg;

	fosil_syncd_start(SYNCD_ON_ALL_PROCESSORS, fosil_blackout, &x, 0);
	fosil_syncd_finish();
}
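/*
 * Usage sketch (illustrative only; it mirrors the calls made from
 * fosil_synchronize_memory() below): the callback runs on every online
 * processor with interrupts and bottom halves disabled, and a non-zero
 * return value is treated as a clock adjustment in milliseconds.
 */
#if 0
static unsigned long example_blackout_callback(void *arg)
{
	/* per-cpu work goes here; return 0 for "no clock adjustment" */
	return 0;
}

static void example_caller(void)
{
	fosil_exec_onall(example_blackout_callback, NULL);
}
#endif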
/*
 * This routine is responsible for driving all of the memory
 * synchronization processing.  When complete, the sync function should
 * be called to enter blackout and finalize synchronization.
 */
int fosil_synchronize_memory(copy_proc copy,
			     sync_proc sync,
			     is_pfn_valid_proc is_pfn_valid,
			     is_pfnrange_valid_proc is_pfnrange_valid,
			     unsigned int threshold,
			     unsigned int passes,
			     void * context)
{
	int status;
	struct syncd_data * x;

	x = kmalloc(sizeof(struct syncd_data), GFP_KERNEL);
	if (x == NULL) {
		printk("unable to allocate syncd control structure\n");
		return 1;
	}
	memset(x, 0, sizeof(struct syncd_data));

	x->copy = copy;
	x->sync = sync;
	x->is_pfn_valid = is_pfn_valid;
	x->is_pfnrange_valid = is_pfnrange_valid;
	x->bitcnt = 0;
	x->blackout = IN_BROWNOUT;
	x->done = 0;
	x->threshold = threshold;
	x->context = context;
	x->nearby = NEARBY;

	fosil_exec_onall(map_kernel_small_pmd, NULL);

	// clean up hardware dirty bits
	init_rendezvous(&x->r);
	fosil_exec_onall(harvest, x);

	enable_tracking();

	/*
	 * Copy all memory.  After this we only copy modified
	 * pages, so we need to make sure the unchanged parts
	 * get copied at least once.
	 */
	status = x->copy(0, ~0, 0, x->context);
	if (status != FOSIL_ERR_OK) {
		printk("%s: %d\n", __FUNCTION__, __LINE__);
		goto fosil_sync_out;
	}

	for (x->pass = passes; (x->pass >= 0); x->pass--) {
		DBGPRNT(DBGLVL_BROWNOUT, "brownout pass %d - %ld pages found (%d)\n",
			x->pass, x->bitcnt, atomic_read(&mm_tracking_struct.count));

		init_rendezvous(&x->r);
		fosil_exec_onall(brownout_cycle, x);
		if (x->status) {
			printk("%s(%d): brownout_cycle failed\n",
			       __FUNCTION__, __LINE__);
			// flush any incomplete transactions
			x->copy(~0, ~0, 0, x->context);
			break;
		}
		if (x->done)
			break;

		/* Give back some time to everybody else */
		schedule();

		process_vector(x);
		if (x->status) {
			printk("%s(%d): process_vector failed\n",
			       __FUNCTION__, __LINE__);
			// flush any incomplete transactions
			x->copy(~0, ~0, 0, x->context);
			break;
		}
	}

	disable_tracking();

#if 0
	{
		int i;
		int total = 0;

		for (i = 0; i < x->max_range; i++) {
			printk("start %08lx num %lx(%ld) gap [%ld]\n",
			       x->CopyRange[i].start,
			       x->CopyRange[i].num, x->CopyRange[i].num,
			       (i == 0) ? 0 :
			       (x->CopyRange[i].start -
				(x->CopyRange[i-1].start + x->CopyRange[i-1].num)));
			total += x->CopyRange[i].num;
		}
		printk("max range %d total pages %d\n", x->max_range, total);
	}
#endif

	DBGPRNT(DBGLVL_BROWNOUT, "last pass found %ld pages\n", x->bitcnt);
	DBGPRNT(DBGLVL_BROWNOUT, "copy range max %d\n", x->max_range);

	status = x->status;

fosil_sync_out:
	kfree(x);
	fosil_exec_onall(map_kernel_orig_pmd, NULL);
	return status;
}
EXPORT_SYMBOL(fosil_synchronize_memory);
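/*
 * Caller sketch (illustrative; the real callback typedefs and error codes
 * live in fosil.h, and the invocations in this file define the expected
 * behavior):
 *
 *   copy(start_pfn, num_pfns, in_blackout, context)
 *       transfer 'num_pfns' pages starting at 'start_pfn'; returns
 *       FOSIL_ERR_OK on success.  copy(0, ~0, 0, ctx) means "copy all
 *       memory" and copy(~0, ~0, 0, ctx) flushes incomplete transactions.
 *   sync(nranges, ranges, &duration_in_ms, context)
 *       called once, inside blackout, with the final CopyRange list; the
 *       duration pointer is used to report the blackout length in ms.
 *   is_pfn_valid(pfn) / is_pfnrange_valid(first, last)
 *       tell the harvester which pfns (and spans between regions) are
 *       eligible for mirroring.
 *
 *   status = fosil_synchronize_memory(copy, sync, is_pfn_valid,
 *                                     is_pfnrange_valid,
 *                                     threshold, passes, context);
 */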
#ifdef REPLACE_KERN_FUNCS
static void (*orig_track_pmd)(void *);

static int check_old_ptes(pmd_t *pmd, int *u_count, int *k_count)
{
	int i, lost_pte = 0;
	unsigned long pfn;
	pte_t *pte = pte_offset_kernel(pmd, 0);

	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
		if (pte_none(*pte) || !pte_present(*pte))
			continue;

		pfn = pte_pfn(*pte);
		if (pfn >= mm_tracking_struct.bitcnt)
			continue;

		if (!test_and_set_bit(pfn, mm_tracking_struct.vector)) {
			if (pte_val(*pte) & _PAGE_USER)
				(*u_count)++;
			else
				(*k_count)++;
			atomic_inc(&mm_tracking_struct.count);
			lost_pte++;
		}
	}
	return lost_pte;
}

static void fosil_track_pmd(void * val)
{
	pmd_t *pmd = (pmd_t*)val;
	int u_count = 0, k_count = 0;

	if (pmd_large(*pmd)) {
		orig_track_pmd(val);
		return;
	}

	check_old_ptes(pmd, &u_count, &k_count);
	orig_track_pmd(val);
}
#endif

int __init fosil_harvest_init(void)
{
	extern unsigned long num_physpages;

#ifdef USE_SMALL_PMD
	if (small_pmd_init())
		goto out_small_pmd;
#endif

#ifdef REPLACE_KERN_FUNCS
	orig_track_pmd = do_mm_track_pmd;
	do_mm_track_pmd = fosil_track_pmd;
#endif

	mm_tracking_struct.vector = vmalloc(num_physpages/8);
	mm_tracking_struct.bitcnt = num_physpages;
	if (mm_tracking_struct.vector == NULL) {
		printk("%s: unable to allocate memory tracking bitmap\n",
		       __FUNCTION__);
		goto out_no_mm_track;
	}

	fosil_scratch_page = (void*)__get_free_pages(GFP_KERNEL, 0);
	if (!fosil_scratch_page)
		goto out_no_fsp;

	fosil_scratch_lock = (spinlock_t*)kmalloc(sizeof(spinlock_t), GFP_KERNEL);
	if (!fosil_scratch_lock)
		goto out_no_fsl;
	spin_lock_init(fosil_scratch_lock);

	return 0;

out_no_fsl:
	free_pages((unsigned long)fosil_scratch_page, 0);
out_no_fsp:
	vfree(mm_tracking_struct.vector);
out_no_mm_track:
#ifdef USE_SMALL_PMD
	small_pmd_exit();
out_small_pmd:
#endif
	return -ENOMEM;
}

void __exit fosil_harvest_cleanup(void)
{
#ifdef USE_SMALL_PMD
	small_pmd_exit();
#endif

#ifdef REPLACE_KERN_FUNCS
	do_mm_track_pmd = orig_track_pmd;
#endif

	// make sure tracking is definitely turned off
	disable_tracking();

	// wake me after everyone is sure to be done with mm_track()
	synchronize_sched();

	if (mm_tracking_struct.vector)
		vfree(mm_tracking_struct.vector);
	if (fosil_scratch_page)
		free_pages((unsigned long)fosil_scratch_page, 0);
	if (fosil_scratch_lock)
		kfree(fosil_scratch_lock);
	mm_tracking_struct.bitcnt = 0;

	return;
}

#else /* CONFIG_TRACK_DIRTY_PAGES */

int __init fosil_harvest_init(void)
{
	return 0;
}

void __exit fosil_harvest_cleanup(void)
{
	return;
}

#endif /* CONFIG_TRACK_DIRTY_PAGES */