lists.openwall.net | lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC | |
Open Source and information security mailing list archives
| ||
|
Date: Fri, 31 Jul 2009 21:01:13 -0400 From: Jim Paradis <jparadis@...hat.com> To: linux-mm@...ck.org CC: linux-kernel@...r.kernel.org Subject: [PATCH 2/2] Dirty page tracking & on-the-fly memory mirroring This patch is a reference implementation of a memory-mirroring module ("pagesync"). It is the same code that Stratus uses minus some hardware-specific bits. This module scans through physical memory, clearing the hardware dirty bit of any dirty page and setting the software dirty bit. If a dirty page has the *hardware* dirty bit set on a subsequent scan, we know that the page has been re-dirtied and it is a candidate for being copied again. This code is invoked through this API: struct pagesync_page_range { unsigned int start; unsigned int num; } typedef int (*copy_proc)(unsigned long start_pfn, unsigned long count, int blackout, void *context); typedef int (*sync_proc)(int max_range, struct pagesync_page_range *copy_range, void *context); int pagesync_synchronize_memory(copy_proc copy, sync_proc sync, unsigned int threshold, unsigned int passes, void *context) The "copy" and "sync" functions are system-specific and supplied by the caller. The "copy" function copies a range of memory from the current node to the destination node, while the "sync" function does final cleanups and cuts over to the new node. Kconfig | 12 mm/Makefile | 4 mm/init_32.c | 2 mm/pagesync.h | 66 ++ mm/pagesync_harvest.c | 1124 ++++++++++++++++++++++++++++++++++++++++++++++++++ mm/pagesync_harvest.h | 69 +++ mm/pagesync_syncd.c | 259 +++++++++++ mm/pagesync_syncd.h | 185 ++++++++ 8 files changed, 1721 insertions(+) Signed-off-by: jparadis@...hat.com diff -up linux-2.6-git-track/arch/x86/Kconfig.harvest linux-2.6-git-track/arch/x86/Kconfig --- linux-2.6-git-track/arch/x86/Kconfig.harvest 2009-07-21 13:52:56.000000000 -0400 +++ linux-2.6-git-track/arch/x86/Kconfig 2009-07-28 16:54:36.000000000 -0400 @@ -1138,6 +1138,18 @@ config TRACK_DIRTY_PAGES memory to another system or node. Most users will say n here. +config PAGESYNC_HARVEST + tristate "Enable reference implementation of pagesync harvest code" + default n + depends on TRACK_DIRTY_PAGES && EXPERIMENTAL + ---help--- + Turning this on builds a reference implementation of + a page-harvesting driver. This driver makes use of + dirty page tracking to allow on-the-fly mirroring of + system memory. This could be useful, for example, for + failing over to a spare module in a cluster configuration. + Most users will say n here. + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" diff -up linux-2.6-git-track/arch/x86/mm/init_32.c.harvest linux-2.6-git-track/arch/x86/mm/init_32.c --- linux-2.6-git-track/arch/x86/mm/init_32.c.harvest 2009-07-21 13:46:54.000000000 -0400 +++ linux-2.6-git-track/arch/x86/mm/init_32.c 2009-07-21 18:30:57.000000000 -0400 @@ -1075,3 +1075,5 @@ int __init reserve_bootmem_generic(unsig { return reserve_bootmem(phys, len, flags); } + +EXPORT_SYMBOL_GPL(swapper_pg_dir); diff -up linux-2.6-git-track/arch/x86/mm/Makefile.harvest linux-2.6-git-track/arch/x86/mm/Makefile --- linux-2.6-git-track/arch/x86/mm/Makefile.harvest 2009-07-21 13:52:56.000000000 -0400 +++ linux-2.6-git-track/arch/x86/mm/Makefile 2009-07-21 18:37:41.000000000 -0400 @@ -21,4 +21,8 @@ obj-$(CONFIG_K8_NUMA) += k8topology_64. obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o obj-$(CONFIG_TRACK_DIRTY_PAGES) += track.o +obj-$(CONFIG_PAGESYNC_HARVEST) += pagesync.o + +pagesync-objs := pagesync_harvest.o pagesync_syncd.o + obj-$(CONFIG_MEMTEST) += memtest.o diff -up linux-2.6-git-track/arch/x86/mm/pagesync_harvest.c.harvest linux-2.6-git-track/arch/x86/mm/pagesync_harvest.c --- linux-2.6-git-track/arch/x86/mm/pagesync_harvest.c.harvest 2009-07-21 18:30:57.000000000 -0400 +++ linux-2.6-git-track/arch/x86/mm/pagesync_harvest.c 2009-07-28 12:30:58.000000000 -0400 @@ -0,0 +1,1123 @@ +/* Low-level routines for memory mirroring. Tracks dirty pages and + * utilizes provided copy routine to transfer pages which have changed + * between harvest passes. + * + * Copyright (C) 2006, 2007, 2009 Stratus Technologies Bermuda Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h> +#include "pagesync_harvest.h" +#include "pagesync_syncd.h" +#include <linux/interrupt.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/sched.h> +#include <linux/vmalloc.h> +#include <asm/io.h> +#include <asm/mm_track.h> +#include <asm/tlbflush.h> +#include <asm/pgtable.h> + + +#ifdef CONFIG_TRACK_DIRTY_PAGES + +#if PAGETABLE_LEVELS <= 3 +static inline unsigned pud_index(unsigned long address) +{ + return 0; +} +#endif + +#if PAGETABLE_LEVELS < 3 +static inline unsigned long pmd_pfn(pmd_t pmd) +{ + return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; +} +#endif + + + +#define DBGPRNT(lvl, args...) \ +({ \ + if (lvl & dbgmask) \ + printk(args); \ +}) + +enum { + DBGLVL_NONE = 0x00, + DBGLVL_HARVEST = 0x01, + DBGLVL_BROWNOUT = 0x02, + DBGLVL_TIMER = 0x04, +}; + +static int dbgmask = 0xff; + +/* + * Caller dirties a page for OS_DEBUG prints (ringbuffer). + * Provide that page here, so we can track it, as it will be + * dirtied on the way to the sync point. + */ +void *pagesync_scratch_page; +EXPORT_SYMBOL(pagesync_scratch_page); +static spinlock_t *pagesync_scratch_lock; +EXPORT_SYMBOL(pagesync_scratch_lock); + +#define MAX_COPYRANGE 100 /* this is actually defined in cc/host.h */ +#define NEARBY 6 +enum { + IN_BROWNOUT = 0, + IN_BLACKOUT = 1, +}; + +struct pagesync_data { + unsigned long flags[NR_CPUS]; + copy_proc copy; + sync_proc sync; + void *context; + int status; /* return status from command inside blackout */ + int max_range; /* # of valid ranges in following CopyRange */ + struct pagesync_page_range CopyRange[MAX_COPYRANGE]; + unsigned long nearby; /* insert merge tunable */ + unsigned int duration_in_ms; /* blackout during */ + unsigned long bitcnt; /* number of bits find in last harvest */ + int blackout; /* 1 == blackout, 0 == brownout */ + int pass; /* # of times through harvest/process cycle */ + int done; /* 1 == done - time to leave */ + unsigned int threshold; /* number of dirty pages before sync */ + atomic_t r; + atomic_t harvest_waiters; +}; + +struct blackout_data { + atomic_t r[5]; + unsigned int oncpu; + unsigned long (*func)(void *); + void *arg; +}; + +static unsigned long idx_to_va(unsigned long pgd_idx, + unsigned long pud_idx, + unsigned long pmd_idx, + unsigned long pte_idx) +{ + unsigned long va = + (pgd_idx << PGDIR_SHIFT) + + (pud_idx << PUD_SHIFT) + + (pmd_idx << PMD_SHIFT) + + (pte_idx << PAGE_SHIFT); + static const unsigned long hole_mask = + ~((1UL << (__VIRTUAL_MASK_SHIFT - 1)) - 1); + + if (va & hole_mask) { + /* Sign extend to canonical addr */ + va |= hole_mask; + } + return va; +} + +static void harvest_table(pgd_t *pgd_base, + unsigned long start, + unsigned long end) +{ + unsigned long i, j, k, l; + + pgd_t *pgd; + + unsigned long pgd_start, pgd_end; + unsigned long pud_start, pud_end; + unsigned long pmd_start, pmd_end; + unsigned long pte_start, pte_end; + + unsigned long pud_last; + unsigned long pmd_last; + unsigned long pte_last; + + /* + * end marks the first byte after the upper limit of the address + * range + */ + end--; + + if (!pgd_base) { + printk("%s: null pgd?\n", __func__); + return; + } + + pgd_start = pgd_index(start); + pgd_end = pgd_index(end); + + pud_start = pud_index(start); + pud_end = pud_index(end); + + pmd_start = pmd_index(start); + pmd_end = pmd_index(end); + + pte_start = pte_index(start); + pte_end = pte_index(end); + + pgd = pgd_base + pgd_start; + + for (i = pgd_start; i <= pgd_end; i++, pgd++) { + + pud_t *pud; + + if (pgd_none(*pgd) || !pgd_present(*pgd)) + continue; + + if (i == pgd_start) + j = pud_start; + else + j = 0; + + pud = pud_offset(pgd, 0) + j; + + if (i == pgd_end) + pud_last = pud_end; + else + pud_last = PTRS_PER_PUD - 1; + + for (; j <= pud_last; j++, pud++) { + pmd_t *pmd; + pud_t tmp_pud = *pud; + + if (pud_none(tmp_pud) || !pud_present(tmp_pud)) + continue; + + if ((i == pgd_start) && (j == pud_start)) + k = pmd_start; + else + k = 0; + + pmd = pmd_offset(pud, 0) + k; + + if ((i == pgd_end) && (j == pud_end)) + pmd_last = pmd_end; + else + pmd_last = PTRS_PER_PMD - 1; + + for (; k <= pmd_last; k++, pmd++) { + pte_t *pte; + pmd_t tmp_pmd = *pmd; + + if (pmd_none(tmp_pmd) || !pmd_present(tmp_pmd)) + continue; + + if (pmd_large(tmp_pmd)) { + /* + * Hardware may have flipped on the + * ACCESSED bit. If so, + * track the pmd page itself. + */ + if (pmd_val(tmp_pmd) & _PAGE_ACCESSED) { + set_pmd(&tmp_pmd, __pmd(pmd_val(tmp_pmd) & ~_PAGE_ACCESSED)); + mm_track_phys((void *) virt_to_phys(pmd)); + } + if (pmd_val(tmp_pmd) & _PAGE_DIRTY) + set_pmd(&tmp_pmd, __pmd((pmd_val(tmp_pmd) & ~_PAGE_DIRTY) | _PAGE_SOFTDIRTY)); + if (pmd_val(tmp_pmd) != pmd_val(*pmd)) { + set_pmd(pmd, tmp_pmd); + __flush_tlb_one(idx_to_va(i, j, k, 0)); + } + continue; + } + + if ((i == pgd_start) && (j == pud_start) && (k == pmd_start)) + l = pte_start; + else + l = 0; + + pte = pte_offset_kernel(pmd, 0) + l; + + if ((i == pgd_end) && (j == pud_end) && (k == pmd_end)) + pte_last = pte_end; + else + pte_last = PTRS_PER_PTE - 1; + + for (; l <= pte_last; l++, pte++) { + + pte_t tmp_pte = *pte; + + /* + * Hardware may have flipped on the ACCESSED bit. If so, + * track the pte page itself. + */ + if (pte_val(tmp_pte) & _PAGE_ACCESSED) { + set_pte(&tmp_pte, __pte(pte_val(tmp_pte) & ~_PAGE_ACCESSED)); + mm_track_phys((void *) virt_to_phys(pte)); + } + if (pte_val(tmp_pte) & _PAGE_DIRTY) + set_pte(&tmp_pte, __pte((pte_val(tmp_pte) & ~_PAGE_DIRTY) | _PAGE_SOFTDIRTY)); + + if (pte_val(tmp_pte) != pte_val(*pte)) { + set_pte(pte, tmp_pte); + __flush_tlb_one(idx_to_va(i, j, k, l)); + } + } + } + } + } +} + +static void harvest_mm(struct mm_struct *mm) +{ + struct vm_area_struct *mmap, *vma; + + vma = mmap = mm->mmap; + if (!mmap) + return; + + do { + unsigned long start, end; + + if (!vma) + break; + + start = vma->vm_start; + end = vma->vm_end; + + harvest_table(mm->pgd, start, end); + + } while ((vma = vma->vm_next) != mmap); +} + +static void harvest_user(void) +{ + struct task_struct *p; +#ifdef CONFIG_SMP + int task_count = 0; + int this_cpu = smp_processor_id(); + int num_cpus = num_online_cpus(); +#endif + + for_each_process(p) { + struct mm_struct *mm = p->mm; + +#ifdef CONFIG_SMP + if (this_cpu != (task_count++ % num_cpus)) + continue; +#endif + + if (!mm) + continue; + + harvest_mm(mm); + } +} + +static pgd_t *get_pgd(unsigned long address) +{ + pgd_t *pgd; + +#if PAGETABLE_LEVELS > 3 + pgd = &init_level4_pgt[0]; +#else + pgd = &swapper_pg_dir[0]; +#endif + pgd += pgd_index(address); + if (!pgd_present(*pgd)) + return NULL; + + return pgd; +} + +/* + * Difference or zero + * + * d <- (x - y), x >= y + * 0 , x < y + */ +static unsigned long +doz(unsigned long x, unsigned long y) +{ + if (x < y) + return 0; + + return x - y; +} + +/* + * Merge closest two regions to free up one slot in table. + */ +static int +merge(struct pagesync_data *x) +{ + int i, ii; + unsigned long delta, ndelta; + unsigned long end; + struct pagesync_page_range *cr = x->CopyRange; + + i = 0; + ii = ~0; + delta = ~0UL; + do { + end = cr[i].start + cr[i].num - 1; + ndelta = cr[i+1].start - end; + + if (ndelta <= delta) { + delta = ndelta; + ii = i; + } + } while (i++ < (x->max_range - 1)); + + if (ii == ~0) + return 1; + + cr[ii].num = (cr[ii+1].start + cr[ii+1].num) - cr[ii].start; + + /* shift everyone down */ + x->max_range--; + for (i = ii + 1; i < x->max_range; i++) { + cr[i].start = cr[i+1].start; + cr[i].num = cr[i+1].num; + } + + return 0; +} + +static int +insert_pfn(struct pagesync_data *x, unsigned long pfn) +{ + int i; + int ii; + struct pagesync_page_range *cr = x->CopyRange; + int nearby = x->nearby; + + restart: + for (i = 0; i < x->max_range; i++) { + unsigned long start = cr[i].start; + unsigned long len = cr[i].num; + unsigned long end = start + len - 1; + + /* inside current region: start <= pfn <= end */ + if ((pfn - start) <= (len - 1)) + return 0; + + /* adjacent to start start-nearby <= pfn <= start */ + if ((pfn - doz(start, nearby)) <= start - doz(start, nearby)) { + cr[i].num += (cr[i].start - pfn); + cr[i].start = pfn; + return 0; + } + + /* adjacent to end: end <= pfn <= end+nearby */ + if ((pfn - end) <= nearby) { + cr[i].num += pfn - end; + end += pfn - end; + again: + if (i+1 == x->max_range) + return 0; + + /* this range may overlap or abut next range */ + /* after insert (next.start - nearby) <= end */ + if (doz(cr[i+1].start, nearby) <= end) { + cr[i].num = (cr[i+1].start + cr[i+1].num) + - cr[i].start; + end = cr[i].start + cr[i].num - 1; + + /* shift everyone down */ + x->max_range--; + for (ii = i + 1; ii < x->max_range; ii++) { + cr[ii].start = cr[ii+1].start; + cr[ii].num = cr[ii+1].num; + } + goto again; + } + + return 0; + } + + /* before region pfn < start-nearby - OR - */ + /* !at end of array - AND - */ + /* between regions end+nearby < pfn < next.start-nearby */ + if ((pfn < doz(start, nearby)) || + ((i+1 == x->max_range) && + ((end+nearby < pfn) && + (pfn < doz(cr[i+1].start, nearby))))) { + if (x->max_range == MAX_COPYRANGE) { + if (merge(x)) + return 1; + goto restart; + } + /* shift everyone up to make room */ + for (ii = x->max_range; ii > i; ii--) { + cr[ii].start = cr[ii-1].start; + cr[ii].num = cr[ii-1].num; + } + if (pfn < doz(start, nearby)) + ii = i; /* before region */ + else + ii = i + 1; /* between regions */ + cr[ii].start = pfn; + cr[ii].num = 1; + x->max_range++; + return 0; + } + } + + if (x->max_range == MAX_COPYRANGE) + if (merge(x)) + return 1; + + /* append to array */ + cr[x->max_range].start = pfn; + cr[x->max_range].num = 1; + x->max_range++; + + return 0; +} + +static unsigned long virt_to_pfn(void *addr) +{ + unsigned long phys = virt_to_phys(addr); + return phys >> PAGE_SHIFT; +} + +static int add_stack_pages(struct pagesync_data *x, unsigned long *addr) +{ + unsigned long dummy; + unsigned long *stack; + unsigned long pfn, num_pfns; + int rc = 0; + unsigned long i; + + stack = addr != NULL ? addr : &dummy; + + pfn = (unsigned long)(virt_to_phys(stack) & ~(THREAD_SIZE - 1)); + pfn >>= PAGE_SHIFT; + num_pfns = (THREAD_SIZE + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + + for (i = 0; i < num_pfns; i++, pfn++) { + rc = insert_pfn(x, pfn); + if (rc) + return rc; + } + + return 0; +} + +/* + * With a fundamental knowledge of what absolutely MUST be copied + * by the HOST during blackout, add pfn's to the copy range list. + * + * All page tables are added to avoid having to deal with optimizing + * which ptes might be set dirty or accessed by the last few operations + * before blackout. + * + * After page tables, the remainder of the pages are our known "working + * set". This includes the stack frames for this task and processor and + * the pagesync structure (which we are making dirty by compiling this list). + * + * Finally, all pages marked as dirty in their pte's are added to the + * list. These are pages which escaped the harvest/process cycles and + * were presumably dirtied between harvest and here. + */ +static int setup_blackout_copylist(struct pagesync_data *x) +{ + pgd_t *pgd; + + int i, j, k; + int n; + + x->max_range = 0; + + pgd = get_pgd(PAGE_OFFSET); + + if (!pgd) { + printk(KERN_WARNING "%s: null pgd\n", __func__); + return PAGESYNC_ERR_MERGE_FAIL; + } + + if (insert_pfn(x, virt_to_pfn(pgd))) + return PAGESYNC_ERR_MERGE_FAIL; + + for (n = pgd_index(PAGE_OFFSET); n < PTRS_PER_PGD; n++, pgd++) { + + pud_t *pud; + + if (pgd_none(*pgd) || !pgd_present(*pgd)) + continue; + + pud = pud_offset(pgd, 0); + + if (insert_pfn(x, virt_to_pfn(pud))) + return PAGESYNC_ERR_MERGE_FAIL; + + for (i = 0; i < PTRS_PER_PUD; i++, pud++) { + + pmd_t *pmd; + + if (pud_none(*pud) || !pud_present(*pud)) + continue; + + pmd = pmd_offset(pud, 0); + + if (insert_pfn(x, virt_to_pfn(pmd))) + return PAGESYNC_ERR_MERGE_FAIL; + + for (j = 0; j < PTRS_PER_PMD; j++, pmd++) { + + pte_t *pte; + + if (pmd_none(*pmd) || !pmd_present(*pmd)) + continue; + + if (pmd_large(*pmd)) { + if ((pmd_val(*pmd) & _PAGE_DIRTY)) { + unsigned long p, pfn = pmd_pfn(*pmd); + for (p = 0; p < PTRS_PER_PTE; p++) { + if (insert_pfn(x, pfn + p)) + return PAGESYNC_ERR_MERGE_FAIL; + } + } + continue; + } + + pte = pte_offset_kernel(pmd, 0); + + if (insert_pfn(x, virt_to_pfn(pte))) + return PAGESYNC_ERR_MERGE_FAIL; + + for (k = 0; k < PTRS_PER_PTE; k++, pte++) { + + if (pte_none(*pte) || !pte_present(*pte)) + continue; + + if (pte_val(*pte) & _PAGE_DIRTY) { + if (insert_pfn(x, pte_pfn(*pte))) + return PAGESYNC_ERR_MERGE_FAIL; + } + } + } + } + } + + /* copy the stack we are working from */ + if (add_stack_pages(x, NULL)) + return PAGESYNC_ERR_MERGE_FAIL; + + /* add pagesync structure */ + if (insert_pfn(x, virt_to_pfn(x))) + return PAGESYNC_ERR_MERGE_FAIL; + + /* add mm_tracking_struct (disable_tracking() dirtied it). */ + if (insert_pfn(x, virt_to_pfn(&mm_tracking_struct))) + return PAGESYNC_ERR_MERGE_FAIL; + + /* + * add the scratch page, in case the caller wants + * to dirty it on the way to sync. + */ + if (insert_pfn(x, virt_to_pfn(pagesync_scratch_page))) + return PAGESYNC_ERR_MERGE_FAIL; + + if (insert_pfn(x, virt_to_pfn(pagesync_scratch_lock))) + return PAGESYNC_ERR_MERGE_FAIL; + + return PAGESYNC_ERR_OK; +} + +static void harvest_kernel(void) +{ + unsigned long n, i, j, k; + + pgd_t *pgd = get_pgd(PAGE_OFFSET); + + if (!pgd) { + printk(KERN_WARNING "%s: null pgd\n", __func__); + return; + } + + /* Skip addrs < PAGE_OFFSET */ + for (n = pgd_index(PAGE_OFFSET); n < PTRS_PER_PGD; n++, pgd++) { + pud_t *pud; + + if (pgd_none(*pgd) || !pgd_present(*pgd)) + continue; + + pud = pud_offset(pgd, 0); + + if (!pud) { + printk(KERN_WARNING "%s: null pud?\n", __func__); + return; + } + + for (i = 0; i < PTRS_PER_PUD; i++, pud++) { + + pmd_t *pmd; + + if (pud_none(*pud) || !pud_present(*pud)) + continue; + + pmd = pmd_offset(pud, 0); + + for (j = 0; j < PTRS_PER_PMD; j++, pmd++) { + pte_t *pte; + pmd_t tmp_pmd = *pmd; + + if (pmd_none(tmp_pmd) || !pmd_present(tmp_pmd)) + continue; + + if (pmd_large(tmp_pmd)) { + if (pmd_val(tmp_pmd) & _PAGE_DIRTY) + set_pmd(&tmp_pmd, __pmd((pmd_val(tmp_pmd) & ~_PAGE_DIRTY) | _PAGE_SOFTDIRTY)); + if (pmd_val(tmp_pmd) != pmd_val(*pmd)) { + set_pmd(pmd, tmp_pmd); + __flush_tlb_one(idx_to_va(n, i, j, 0)); + mm_track_phys((void *)virt_to_phys(pmd)); + } + continue; + } + + pte = pte_offset_kernel(pmd, 0); + + for (k = 0; k < PTRS_PER_PTE; k++, pte++) { + pte_t tmp_pte = *pte; + + if (pte_none(tmp_pte) || !pte_present(tmp_pte)) + continue; + + if (pte_val(tmp_pte) & _PAGE_DIRTY) + set_pte(&tmp_pte, __pte((pte_val(tmp_pte) & ~_PAGE_DIRTY) | _PAGE_SOFTDIRTY)); + + if (pte_val(tmp_pte) != pte_val(*pte)) { + set_pte(pte, tmp_pte); + __flush_tlb_one(idx_to_va(n, i, j, k)); + mm_track_phys((void *) virt_to_phys(pte)); + } + } + } + } + } +} + +static unsigned long harvest(void *arg) +{ +#ifdef CONFIG_SMP + struct pagesync_data * x = (struct pagesync_data *) arg; +#else + if (smp_processor_id() != 0) + return 0; +#endif + + harvest_user(); + +#ifdef CONFIG_SMP + if (smp_processor_id() == 0) + atomic_set(&x->harvest_waiters, 0); + + rendezvous(&x->r); + + if (smp_processor_id() == 0) { + /* Wait for all other cpus to go to the next */ + /* rendezvous (having also flushed their tlbs). */ + while (atomic_read(&x->harvest_waiters) < + num_online_cpus() - 1) + ; /* wait */ + harvest_kernel(); + } +#else + harvest_kernel(); +#endif + + return 0; +} + + +static void pagesync_blackout(void *arg, unsigned int numprocs) +{ + struct blackout_data * x = (struct blackout_data *) arg; + unsigned long flags; + + rendezvous_sched(&x->r[0]); + local_bh_disable(); + rendezvous(&x->r[1]); + local_irq_save(flags); + rendezvous(&x->r[2]); + + (void)x->func(x->arg); + + if (x->arg) { + struct pagesync_data *sdp = (struct pagesync_data *)(x->arg); + __flush_tlb_all(); + atomic_inc(&sdp->harvest_waiters); + } + + rendezvous(&x->r[3]); + + local_irq_restore(flags); + local_bh_enable(); +} + +/* + * Call the specified function with the provided argument in + * during a blackout. + */ +static void pagesync_exec_onall(unsigned long (*func)(void *), void * arg) +{ + struct blackout_data x; + + init_rendezvous(&x.r[0]); + init_rendezvous(&x.r[1]); + init_rendezvous(&x.r[2]); + init_rendezvous(&x.r[3]); + init_rendezvous(&x.r[4]); + x.oncpu = safe_smp_processor_id(); + x.func = func; + x.arg = arg; + + pagesync_syncd_start(pagesync_blackout, &x, 0); + pagesync_syncd_finish(); +} + +static void enable_tracking(void) +{ + /* initialize tracking structure */ + atomic_set(&mm_tracking_struct.count, 0); + memset(mm_tracking_struct.vector, 0, mm_tracking_struct.bitcnt/8); + + /* turn on tracking */ + mm_tracking_struct.active = 1; +} + +static void disable_tracking(void) +{ + /* turn off tracking */ + mm_tracking_struct.active = 0; +} + +static int is_in_copylist(struct pagesync_data *x, unsigned int pfn) +{ + int i; + + for (i = 0; i < x->max_range; i++) { + unsigned int start = x->CopyRange[i].start; + unsigned int end = start + x->CopyRange[i].num; + if (pfn >= start && pfn < end) + return 1; + } + + return 0; +} + +static int bit_is_set(struct pagesync_data *x, + unsigned int pfn, + unsigned long *vector) +{ + if (!x->blackout) + return test_and_clear_bit(pfn, vector); + + /* + * Optimization: We are in blackout and we have already constructed the + * blackout copylist. Any pages in there will be copied by the SMI + * handler (syncpoint). So don't allow process vector to copy any of + * these pages or else they'll get copied twice. + */ + if (is_in_copylist(x, pfn)) + return 0; + + return test_bit(pfn, vector); +} + +static void process_vector(void *arg) +{ + struct pagesync_data * x = (struct pagesync_data *) arg; + unsigned long pfn; + unsigned long start_pfn, runlength; + + /* traverse the the bitmap looking for dirty pages */ + runlength = 0; + start_pfn = 0; + + for (pfn = 0; pfn < mm_tracking_struct.bitcnt; pfn++) { + if (bit_is_set(x, pfn, mm_tracking_struct.vector)) { + if (!(x->blackout)) + atomic_dec(&mm_tracking_struct.count); + if (runlength == 0) + start_pfn = pfn; + runlength++; + } else if (runlength) { + x->status = x->copy(start_pfn, runlength, + x->blackout, x->context); + if (x->status != PAGESYNC_ERR_OK) { + printk(KERN_WARNING "%s: %d error (%d), start_pfn %lx, runlength %lx\n", + __func__, __LINE__, + x->status, + start_pfn, + runlength); + return; + } + start_pfn = runlength = 0; + } + } + + /* Catch the last ones. */ + if (runlength) { + x->status = x->copy(start_pfn, runlength, + x->blackout, x->context); + if (x->status != PAGESYNC_ERR_OK) { + printk(KERN_WARNING "%s: %d\n", __func__, __LINE__); + return; + } + } +} + +/* + * This is where all processors are sent for blackout processing. + * The driving processor is diverted to work on harvest and if necessary + * entry to Common Code for synchronization. All other processors are + * spinning in a rendezvous at the bottom of this routine waiting for + * the driving processor to finish. + */ +static unsigned long brownout_cycle(void *arg) +{ + struct pagesync_data * x = (struct pagesync_data *) arg; + +#ifndef CONFIG_SMP + if (safe_smp_processor_id() != 0) + return 0; +#endif + + harvest_user(); + +#ifdef CONFIG_SMP + if (safe_smp_processor_id() == 0) + atomic_set(&x->harvest_waiters, 0); + + rendezvous(&x->r); + + if (safe_smp_processor_id() != 0) + return 0; +#endif + + while (atomic_read(&x->harvest_waiters) < num_online_cpus() - 1) + ; /* Wait */ + + harvest_kernel(); + + x->bitcnt = atomic_read(&mm_tracking_struct.count); + + if ((x->bitcnt < x->threshold) || (x->pass == 0)) { + x->blackout = IN_BLACKOUT; + + disable_tracking(); + + x->status = setup_blackout_copylist(x); + if (x->status) + return 0; + + process_vector(x); + if (x->status) + return PAGESYNC_ERR_COPY_FAIL; + + x->status = x->sync(x->max_range, + x->CopyRange, + x->context); + x->done = 1; + + if (x->status) { + printk(KERN_WARNING "%s: %d (%s)\n", __func__, __LINE__, + PAGESYNC_ERR_STRING(x->status)); + } + + return 0; + } + + return 0; +} + + + +/* + * This routine is responsible for driving all of the memory + * synchronization processing. When complete the sync function should + * be called to enter blackout and finalize synchronization. + */ +int pagesync_synchronize_memory(copy_proc copy, + sync_proc sync, + unsigned int threshold, + unsigned int passes, + void *context) +{ + int status; + struct pagesync_data *x; + + x = kmalloc(sizeof(struct pagesync_data), GFP_KERNEL); + if (x == NULL) { + printk(KERN_WARNING "unable to allocate pagesync control structure\n"); + return 1; + } + + memset(x, 0, sizeof(struct pagesync_data)); + + x->copy = copy; + x->sync = sync; + x->bitcnt = 0; + x->blackout = IN_BROWNOUT; + x->done = 0; + x->threshold = threshold; + x->context = context; + x->nearby = NEARBY; + + /* clean up hardware dirty bits */ + init_rendezvous(&x->r); + pagesync_exec_onall(harvest, x); + enable_tracking(); + + /* + * Copy all memory. After this we only copy modified + * pages, so we need to make sure the unchanged parts + * get copied at least once. + */ + status = x->copy(0, ~0, 0, x->context); + if (status != PAGESYNC_ERR_OK) { + printk(KERN_WARNING "%s: %d\n", __func__, __LINE__); + goto pagesync_sync_out; + } + + + for (x->pass = passes; (x->pass >= 0); x->pass--) { + DBGPRNT(DBGLVL_BROWNOUT, + "brownout pass %d - %ld pages found (%d)\n", + x->pass, x->bitcnt, + atomic_read(&mm_tracking_struct.count)); + + init_rendezvous(&x->r); + pagesync_exec_onall(brownout_cycle, x); + if (x->status) { + printk(KERN_WARNING "%s(%d): brownout_cycle failed \n", + __func__, __LINE__); + /* flush any incomplete transactions */ + x->copy(~0, ~0, 0, x->context); + break; + } + if (x->done) + break; + + /* Give back some time to everybody else */ + schedule(); + + process_vector(x); + if (x->status) { + printk(KERN_WARNING "%s(%d): process_vector failed \n", + __func__, __LINE__); + /* flush any incomplete transactions */ + x->copy(~0, ~0, 0, x->context); + break; + } + } + + disable_tracking(); + + DBGPRNT(DBGLVL_BROWNOUT, "last pass found %ld pages\n", x->bitcnt); + DBGPRNT(DBGLVL_BROWNOUT, "copy range max %d\n", x->max_range); + + status = x->status; + +pagesync_sync_out: + + kfree(x); + + return status; +} +EXPORT_SYMBOL(pagesync_synchronize_memory); + +int __init pagesync_harvest_init(void) +{ + int result; + extern unsigned long num_physpages; + + + result = pagesync_syncd_init(); + if (result) + return result; + + mm_tracking_struct.vector = vmalloc(num_physpages/8); + mm_tracking_struct.bitcnt = num_physpages; + + if (mm_tracking_struct.vector == NULL) { + printk(KERN_WARNING "%s: unable to allocate memory tracking bitmap\n", + __func__); + goto out_no_mm_track; + } + + pagesync_scratch_page = (void *)__get_free_pages(GFP_KERNEL, 0); + if (!pagesync_scratch_page) + goto out_no_fsp; + + pagesync_scratch_lock = kmalloc(sizeof(spinlock_t), + GFP_KERNEL); + if (!pagesync_scratch_lock) + goto out_no_fsl; + + spin_lock_init(pagesync_scratch_lock); + + return 0; + +out_no_fsl: + free_pages((unsigned long)pagesync_scratch_page, 0); + pagesync_scratch_page = NULL; +out_no_fsp: + vfree(mm_tracking_struct.vector); + mm_tracking_struct.vector = NULL; +out_no_mm_track: + return -ENOMEM; +} + +void __exit pagesync_harvest_cleanup(void) +{ + /* wake me after everyone is sure to be done with mm_track() */ + synchronize_sched(); + + if (mm_tracking_struct.vector) { + vfree(mm_tracking_struct.vector); + mm_tracking_struct.vector = NULL; + } + + if (pagesync_scratch_page) + free_pages((unsigned long)pagesync_scratch_page, 0); + + kfree(pagesync_scratch_lock); + + mm_tracking_struct.bitcnt = 0; + + pagesync_syncd_cleanup(); + + return; +} + +module_init(pagesync_harvest_init); +module_exit(pagesync_harvest_cleanup); +MODULE_LICENSE("GPL"); + +#else /* CONFIG_TRACK_DIRTY_PAGES */ + + +int __init pagesync_harvest_init(void) +{ + return 0; +} + +void __exit pagesync_harvest_cleanup(void) +{ + return; +} + + +#endif /* CONFIG_TRACK_DIRTY_PAGES */ diff -up linux-2.6-git-track/arch/x86/mm/pagesync_harvest.h.harvest linux-2.6-git-track/arch/x86/mm/pagesync_harvest.h --- linux-2.6-git-track/arch/x86/mm/pagesync_harvest.h.harvest 2009-07-21 18:30:57.000000000 -0400 +++ linux-2.6-git-track/arch/x86/mm/pagesync_harvest.h 2009-07-23 22:46:56.000000000 -0400 @@ -0,0 +1,69 @@ +/* + * Definitions for memory mirroring/harvest. + * + * Copyright (C) 2006, 2009 Stratus Technologies Bermuda Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef __HARVEST_H__ +#define __HARVEST_H__ + +enum { + PAGESYNC_ERR_OK = 0, + PAGESYNC_ERR_MERGE_FAIL, + PAGESYNC_ERR_COPY_FAIL, + PAGESYNC_ERR_SYNC_FAIL, +}; + +#define PAGESYNC_ERR_STRING(x) \ +({ \ + char *retval; \ + \ + switch (x) { \ + case PAGESYNC_ERR_OK: \ + retval = "OK"; \ + break; \ + case PAGESYNC_ERR_MERGE_FAIL: \ + retval = "Merge failed"; \ + break; \ + case PAGESYNC_ERR_COPY_FAIL: \ + retval = "Copy failed"; \ + break; \ + case PAGESYNC_ERR_SYNC_FAIL: \ + retval = "Sync failed"; \ + break; \ + default: \ + retval = "Unknown error"; \ + } \ + \ + retval; \ +}) + +struct pagesync_page_range { + unsigned int start; + unsigned int num; +}; + +typedef int (*copy_proc)(unsigned long start_pfn, + unsigned long count, + int blackout, void *context); +typedef int (*sync_proc)(int max_range, + struct pagesync_page_range *copy_range, + void *context); + +int pagesync_synchronize_memory(copy_proc, sync_proc, + unsigned int, unsigned int, void *); + +#endif /* __HARVEST_H__ */ diff -up linux-2.6-git-track/arch/x86/mm/pagesync.h.harvest linux-2.6-git-track/arch/x86/mm/pagesync.h --- linux-2.6-git-track/arch/x86/mm/pagesync.h.harvest 2009-07-21 18:30:57.000000000 -0400 +++ linux-2.6-git-track/arch/x86/mm/pagesync.h 2009-07-22 13:11:21.000000000 -0400 @@ -0,0 +1,66 @@ +/* + * pagesync global definitions, prototypes and macros + * + * All kernel header files needed by all components should be defined + * here. Additional local needs will be in their respective + * components. + * + * Copyright (C) 2006, 2007, 2009 Stratus Technologies Bermuda Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h> + +/*********************************************************************** + Macro Definitions + ***********************************************************************/ + +#define DBG(args...) \ +({ \ + if (pagesync_debug) \ + printk(args); \ +}) + +#define HERE() DBG(KERN_DEBUG "%s(%d)\n", __func__, __LINE__) + +/*********************************************************************** + Module global variables + ***********************************************************************/ + +/*********************************************************************** + Initialization and Cleanup Prototypes + ***********************************************************************/ + +#define INIT_CLEANUP_DECLS(NS, COMPONENT) \ + int NS ## _ ## COMPONENT ## _init(void); \ + void NS ## _ ## COMPONENT ## _cleanup(void) + +INIT_CLEANUP_DECLS(pagesync, harvest); +INIT_CLEANUP_DECLS(pagesync, syncd); + +#undef INIT_CLEANUP_DECLS + +/*********************************************************************** + Prototypes + ***********************************************************************/ +void pagesync_syncd_sched(void); +void pagesync_restart(void); + +extern unsigned int pagesync_debug; + + diff -up linux-2.6-git-track/arch/x86/mm/pagesync_syncd.c.harvest linux-2.6-git-track/arch/x86/mm/pagesync_syncd.c --- linux-2.6-git-track/arch/x86/mm/pagesync_syncd.c.harvest 2009-07-21 18:30:57.000000000 -0400 +++ linux-2.6-git-track/arch/x86/mm/pagesync_syncd.c 2009-07-24 14:48:15.000000000 -0400 @@ -0,0 +1,259 @@ +/* + * Synchronization threads. Used when there is a need to run a function + * on one or more CPUs. Typically used to capture all CPUs for blackout + * or other critical region processing where processors must be executing + * in a controlled manner. + * + * Copyright (C) 2006, 2009 Stratus Technologies Bermuda Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifdef CONFIG_TRACK_DIRTY_PAGES + +#include "pagesync.h" + +#include <linux/sched.h> +#include <linux/version.h> +#include <linux/preempt.h> + +#include "pagesync_syncd.h" + +struct syncd_struct { + int target_cpu; + sync_function_t func; + void *arg; + unsigned int num_procs; + struct semaphore ack; +}; + +static struct { + wait_queue_head_t wait; + unsigned long syncd_in_use; +} syncd_event; + +static struct syncd_struct syncd[NR_CPUS]; +static struct thread_state syncd_state[NR_CPUS]; +static DECLARE_MUTEX(syncd_sem); +static cpumask_t old_cpus_allowed; + +unsigned int pagesync_debug = 1; + +static int +syncd_on_processor_mask(sync_function_t func, void *arg, cpumask_t mask, + unsigned int num_procs) +{ + int i; + +#ifdef DEBUG + { + char buf[40]; + + cpumask_scnprintf(buf, sizeof(buf), &mask); + DBG("%s: requesting from cpu %d mask %s\n", + __func__, smp_processor_id(), buf); + } +#endif + + preempt_disable(); + + for (i = 0; i < num_online_cpus(); i++) { + /* skip over cpu's not marked in the mask */ + if (!cpu_isset(i, mask)) + continue; + + syncd[i].func = func; + syncd[i].arg = arg; + syncd[i].num_procs = num_procs; + + /* wake syncd */ + DBG("%s: waking syncd%d\n", __func__, i); + THREAD_WAKE(syncd_state[i]); + } + + preempt_enable(); + + for (i = 0; i < num_online_cpus(); i++) { + /* skip over cpu's not marked in the mask */ + if (!cpu_isset(i, mask)) + continue; + + /* wait here for syncd to check in */ + down(&syncd[i].ack); + } + + return 0; +} + +static int +syncd_on_all_processors(sync_function_t func, void *arg) +{ + int retval = 0; + + DBG("%s: enter\n", __func__); + + retval = syncd_on_processor_mask(func, arg, + cpu_online_map, num_online_cpus()); + + DBG("%s: exit\n", __func__); + + return retval; +} + + +atomic_t syncd_awake_count; /* How many syncds not on wait q? */ + +int +pagesync_syncd_start(sync_function_t func, void *arg, int cpu) +{ + down(&syncd_sem); + + atomic_set(&syncd_awake_count, num_online_cpus()); + return syncd_on_all_processors(func, arg); +} +EXPORT_SYMBOL(pagesync_syncd_start); + +void +pagesync_syncd_finish(void) +{ + wait_event(syncd_event.wait, (syncd_event.syncd_in_use == 0)); + while (atomic_read(&syncd_awake_count)) + schedule(); + up(&syncd_sem); +} +EXPORT_SYMBOL(pagesync_syncd_finish); + +/*****************************************************************/ + +static int syncd_thread(void *arg) +{ + struct thread_state *state = (struct thread_state *) arg; + struct syncd_struct *s = (struct syncd_struct *) state->data; + int target_cpu = s->target_cpu; + cpumask_t mask = CPU_MASK_NONE; + sync_function_t event_func = NULL; + void *event_arg = NULL; + unsigned int event_num_procs = 0; + int tries; + + THREAD_INIT(state); + + /* after we schedule, we should wake on the target cpu only */ + cpu_set(target_cpu, mask); + set_cpus_allowed(current, mask); + + /* raise our priority such that we get chosen to run 'next' */ + current->policy = SCHED_RR; + current->rt_priority = 1; + + while (!THREAD_SHOULD_STOP(state)) { + + THREAD_WAIT(state); + + if (THREAD_SHOULD_STOP(state)) + break; + + tries = 5; + while (target_cpu != smp_processor_id()) { + printk(KERN_ERR + "syncd%d woke on cpu%d trying again\n", + target_cpu, smp_processor_id()); + + yield(); + + if (tries-- == 0) { + printk(KERN_ERR "syncd%d woke on cpu%d " + "trying again\n", target_cpu, + smp_processor_id()); + printk(KERN_ERR "syncd%d exiting\n", + target_cpu); + + /* + * XXX should probably signal an event + * to admin to have driver reloaded + * in the short term + */ + goto exit; + } + } + + event_func = s->func; + event_arg = s->arg; + event_num_procs = s->num_procs; + s->func = NULL; + s->arg = NULL; + s->num_procs = 0; + + DBG("syncd%d responding on cpu %d\n", + target_cpu, smp_processor_id()); + set_bit(target_cpu, &syncd_event.syncd_in_use); + + /* ack we are processing the event */ + up(&s->ack); + + /* process the event */ + if (event_func) + (*event_func)(event_arg, event_num_procs); + + /* notify the event we have completed */ + clear_bit(target_cpu, &syncd_event.syncd_in_use); + wake_up(&syncd_event.wait); + } + exit: + THREAD_EXIT(state) +} + +int +pagesync_syncd_init(void) +{ + int i; + + init_waitqueue_head(&syncd_event.wait); + syncd_event.syncd_in_use = 0; + + for (i = 0; i < num_online_cpus(); i++) { + char name[16]; + + if (!cpu_online(i)) + continue; + + syncd[i].target_cpu = i; + syncd[i].func = NULL; + syncd[i].arg = NULL; + init_MUTEX_LOCKED(&syncd[i].ack); + + snprintf(name, 16, "syncd/%d", i); + THREAD_START(syncd_state[i], syncd_thread, &syncd[i], name); + } + + return 0; +} + +void +pagesync_syncd_cleanup(void) +{ + int i; + + for (i = 0; i < num_online_cpus(); i++) + THREAD_STOP(syncd_state[i]); +} + +void +pagesync_syncd_sched(void) +{ + synchronize_sched(); +} +EXPORT_SYMBOL(pagesync_syncd_sched); + +#endif /* CONFIG_TRACK_DIRTY_PAGES */ diff -up linux-2.6-git-track/arch/x86/mm/pagesync_syncd.h.harvest linux-2.6-git-track/arch/x86/mm/pagesync_syncd.h --- linux-2.6-git-track/arch/x86/mm/pagesync_syncd.h.harvest 2009-07-21 18:30:57.000000000 -0400 +++ linux-2.6-git-track/arch/x86/mm/pagesync_syncd.h 2009-07-23 22:41:59.000000000 -0400 @@ -0,0 +1,185 @@ +/* + * Definitions and prototypes for syncd. + * + * Copyright (C) 2005, 2007, 2009 Stratus Technologies Bermuda Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef __SYNCD_H__ +#define __SYNCD_H__ + +#include <linux/sched.h> +#include <linux/semaphore.h> + +struct thread_state { + int valid; + + pid_t pid; + + char *name; + void *data; + + int should_stop; + wait_queue_head_t waitq; + + struct completion started; + struct task_struct *result; + struct completion done; +}; + +#define THREAD_INIT(state) \ +({ \ + daemonize(state->name); \ + \ + /* allow kernel thread to be reaped. */ \ + current->exit_signal = SIGCHLD; \ + \ + complete(&state->started); \ +}) + +#define THREAD_EXIT(state) complete_and_exit(&state->done, 0); + +#define THREAD_SHOULD_STOP(state) (state->should_stop) + +#define THREAD_START(state, func, fndata, thrdname) \ +({ \ + pid_t pid; \ + \ + state.valid = 1; \ + state.name = thrdname; \ + state.data = fndata; \ + init_completion(&state.started); \ + init_completion(&state.done); \ + init_waitqueue_head(&state.waitq); \ + state.should_stop = 0; \ + \ + pid = kernel_thread(func, &state, 0); \ + if (pid < 0) \ + return pid; \ + \ + wait_for_completion(&state.started); \ + \ + state.pid = pid; \ + state.result = pid_task(find_pid_ns(pid, &init_pid_ns), PIDTYPE_PID); \ +}) + +#define THREAD_STOP(state) \ +({ \ + if (state.valid) { \ + state.should_stop = 1; \ + wake_up_interruptible(&state.waitq); \ + wait_for_completion(&state.done); \ + } \ +}) + +#define THREAD_WAKE(state) \ +({ \ + if (state.valid) \ + wake_up_interruptible(&state.waitq); \ +}) + +#define THREAD_SLEEP_ON_EVENT(state, condition, timeout) \ +({ \ + int retval; \ + retval = wait_event_interruptible_timeout(state->waitq, \ + condition, \ + timeout); \ + if (retval == -ERESTARTSYS) { /* we were interrupted */ \ + retval = 0; \ + } \ + if (retval == 0) { /* timed out */ \ + ; \ + } \ + if (retval >= 0) { /* condition satisfied */ \ + ; \ + } \ + \ + retval; \ +}) + +#define THREAD_SLEEP(state, timeout) \ +({ \ + int retval; \ + retval = THREAD_SLEEP_ON_EVENT(state, \ + THREAD_SHOULD_STOP(state), \ + timeout); \ + retval; \ +}) + +extern atomic_t syncd_awake_count; + +#define THREAD_WAIT(state) \ +({ \ + wait_queue_t __wait; \ + init_waitqueue_entry(&__wait, current); \ + \ + add_wait_queue(&state->waitq, &__wait); \ + set_current_state(TASK_INTERRUPTIBLE); \ + atomic_dec(&syncd_awake_count); \ + schedule(); \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&state->waitq, &__wait); \ +}) + + +enum syncd_request_type { + SYNCD_ON_ALL_PROCESSORS = 1, + SYNCD_ON_OTHER_PROCESSORS, + SYNCD_ON_ONE_PROCESSOR, +}; + +typedef void (*sync_function_t)(void *, unsigned int); + +int pagesync_syncd_start(sync_function_t func, void *arg, int cpu); +void pagesync_syncd_finish(void); + +void pagesync_syncd_sched(void); + +int pagesync_syncd_init(void); +void pagesync_syncd_cleanup(void); + +/*********************************************************************** + Rendezvous + ***********************************************************************/ + +static inline void +init_rendezvous(atomic_t *r) +{ + atomic_set(r, num_online_cpus()); +} + +/** + * Capture CPUs here, waiting for everyone to check in. Once everyone has + * arrived all can leave. + */ +static inline void +rendezvous(atomic_t *r) +{ + atomic_dec(r); + + while (atomic_read(r) != 0) + barrier(); +} + +static inline void +rendezvous_sched(atomic_t *r) +{ + atomic_dec(r); + + while (atomic_read(r) != 0) + schedule(); +} + +#endif /* __SYNCD_H__ */ -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@...r.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists