The core portion of the cpu allocator. The per cpu allocator allows
dynamic allocation of memory on all processors simultaneously. A bitmap
is used to track used areas. The allocator implements tight packing to
reduce the cache footprint and increase speed, since cacheline contention
is typically not a concern for memory mainly used by a single cpu. Small
objects will fill up gaps left by larger allocations that required
alignment.

Signed-off-by: Christoph Lameter

---
 include/linux/cpu_alloc.h |   56 ++++++
 include/linux/mm.h        |   13 +
 include/linux/vmstat.h    |    2
 mm/Kconfig                |   33 +++
 mm/Makefile               |    3
 mm/cpu_alloc.c            |  407 ++++++++++++++++++++++++++++++++++++++++++++++
 mm/vmstat.c               |    1
 7 files changed, 512 insertions(+), 3 deletions(-)

Index: linux-2.6/mm/cpu_alloc.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/mm/cpu_alloc.c	2007-11-06 06:05:06.000000000 +0000
@@ -0,0 +1,407 @@
+/*
+ * Cpu allocator - Manage objects allocated for each processor
+ *
+ * (C) 2007 SGI, Christoph Lameter
+ *	Basic implementation with allocation and free from a dedicated per
+ *	cpu area.
+ *
+ * The per cpu allocator allows dynamic allocation of memory on all
+ * processors simultaneously. A bitmap is used to track used areas.
+ * The allocator implements tight packing to reduce the cache footprint
+ * and increase speed since cacheline contention is typically not a concern
+ * for memory mainly used by a single cpu. Small objects will fill up gaps
+ * left by larger allocations that required alignment.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include			/* i386 definition of init_mm */
+#include			/* i386 dependency on highmem config */
+#include
+#include
+
+/*
+ * Basic allocation unit. A bit map is created to track the use of each
+ * UNIT_SIZE element in the cpu area.
+ */
+
+#define UNIT_SIZE sizeof(int)
+#define UNITS_PER_BLOCK (ALLOC_SIZE / UNIT_SIZE)
+
+/*
+ * Lock to protect the bitmap and the metadata for the cpu allocator.
+ */
+static DEFINE_SPINLOCK(cpu_alloc_map_lock);
+
+#ifdef CONFIG_CPU_AREA_VIRTUAL
+
+/*
+ * Virtualized cpu area. The cpu area can be extended if more space is needed.
+ */
+
+#define cpu_area ((u8 *)(CPU_AREA_BASE))
+#define ALLOC_SIZE (1UL << (CONFIG_CPU_AREA_ALLOC_ORDER + PAGE_SHIFT))
+
+/*
+ * The maximum number of blocks is the maximum size of the
+ * cpu area for one processor divided by the size of an allocation
+ * block.
+ */
+#define MAX_BLOCKS (1UL << (CONFIG_CPU_AREA_ORDER - \
+				CONFIG_CPU_AREA_ALLOC_ORDER))
+
+static unsigned long *cpu_alloc_map = NULL;
+static int cpu_alloc_map_order = -1;	/* Size of the bitmap in page order */
+static unsigned long active_blocks;	/* Number of blocks allocated on each cpu */
+static unsigned long units_free;	/* Number of available units */
+static unsigned long units_total;	/* Total units that are managed */
+
+/*
+ * Allocate a block of memory to be used to provide cpu area memory
+ * or to extend the bitmap for the cpu map.
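+ *
+ * Returns the virtual address of the block or NULL if the underlying
+ * page allocation fails.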
+ */
+void *cpu_area_alloc_block(unsigned long size, gfp_t flags, int node)
+{
+	struct page *page = alloc_pages_node(node,
+				flags, get_order(size));
+	if (page)
+		return page_address(page);
+	return NULL;
+}
+
+pte_t *cpu_area_pte_populate(pmd_t *pmd, unsigned long addr,
+				gfp_t flags, int node)
+{
+	pte_t *pte = pte_offset_kernel(pmd, addr);
+	if (pte_none(*pte)) {
+		pte_t entry;
+		void *p = cpu_area_alloc_block(PAGE_SIZE, flags, node);
+		if (!p)
+			return NULL;
+		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+		set_pte_at(&init_mm, addr, pte, entry);
+	}
+	return pte;
+}
+
+pmd_t *cpu_area_pmd_populate(pud_t *pud, unsigned long addr,
+				gfp_t flags, int node)
+{
+	pmd_t *pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd)) {
+		void *p = cpu_area_alloc_block(PAGE_SIZE, flags, node);
+		if (!p)
+			return NULL;
+		pmd_populate_kernel(&init_mm, pmd, p);
+	}
+	return pmd;
+}
+
+pud_t *cpu_area_pud_populate(pgd_t *pgd, unsigned long addr,
+				gfp_t flags, int node)
+{
+	pud_t *pud = pud_offset(pgd, addr);
+	if (pud_none(*pud)) {
+		void *p = cpu_area_alloc_block(PAGE_SIZE, flags, node);
+		if (!p)
+			return NULL;
+		pud_populate(&init_mm, pud, p);
+	}
+	return pud;
+}
+
+pgd_t *cpu_area_pgd_populate(unsigned long addr, gfp_t flags, int node)
+{
+	pgd_t *pgd = pgd_offset_k(addr);
+	if (pgd_none(*pgd)) {
+		void *p = cpu_area_alloc_block(PAGE_SIZE, flags, node);
+		if (!p)
+			return NULL;
+		pgd_populate(&init_mm, pgd, p);
+	}
+	return pgd;
+}
+
+int cpu_area_populate_basepages(void *start, unsigned long size,
+				gfp_t flags, int node)
+{
+	unsigned long addr = (unsigned long)start;
+	unsigned long end = addr + size;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	for (; addr < end; addr += PAGE_SIZE) {
+		pgd = cpu_area_pgd_populate(addr, flags, node);
+		if (!pgd)
+			return -ENOMEM;
+		pud = cpu_area_pud_populate(pgd, addr, flags, node);
+		if (!pud)
+			return -ENOMEM;
+		pmd = cpu_area_pmd_populate(pud, addr, flags, node);
+		if (!pmd)
+			return -ENOMEM;
+		pte = cpu_area_pte_populate(pmd, addr, flags, node);
+		if (!pte)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * If no other population function is defined then this function will stand
+ * in and provide the capability to map PAGE_SIZE pages into the cpu area.
+ */
+int __attribute__((weak)) cpu_area_populate(void *start, unsigned long size,
+				gfp_t flags, int node)
+{
+	return cpu_area_populate_basepages(start, size, flags, node);
+}
+
+/*
+ * Extend the areas on all processors. This function may be called repeatedly
+ * until we have enough space to accommodate a newly allocated object.
+ *
+ * Must hold the cpu_alloc_map_lock on entry. Will drop the lock and then
+ * regain it.
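+ *
+ * Returns 0 on success and -ENOMEM if the cpu area is already at its
+ * maximum size or the underlying allocations fail. Since the lock is
+ * dropped while the new block is populated, callers must recheck the
+ * allocation map after this returns (cpu_alloc() restarts its search).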
+ */
+static int expand_cpu_area(gfp_t flags)
+{
+	unsigned long blocks = active_blocks;
+	unsigned long bits;
+	int cpu;
+	int err = -ENOMEM;
+	int map_order;
+	unsigned long *new_map = NULL;
+	void *start;
+
+	if (active_blocks == MAX_BLOCKS)
+		goto out;
+
+	spin_unlock(&cpu_alloc_map_lock);
+
+	/*
+	 * Determine the size of the bit map needed
+	 */
+	bits = (blocks + 1) * UNITS_PER_BLOCK;
+	map_order = get_order(DIV_ROUND_UP(bits, 8));
+	start = cpu_area +
+		(blocks << (PAGE_SHIFT + CONFIG_CPU_AREA_ALLOC_ORDER));
+
+	for_each_possible_cpu(cpu) {
+		err = cpu_area_populate(CPU_PTR(start, cpu), ALLOC_SIZE,
+					flags, cpu_to_node(cpu));
+
+		if (err) {
+			spin_lock(&cpu_alloc_map_lock);
+			goto out;
+		}
+	}
+
+	if (map_order > cpu_alloc_map_order) {
+		new_map = cpu_area_alloc_block(PAGE_SIZE << map_order,
+						flags | __GFP_ZERO, 0);
+		if (!new_map) {
+			/* Reacquire the lock so that we exit with it held */
+			err = -ENOMEM;
+			spin_lock(&cpu_alloc_map_lock);
+			goto out;
+		}
+	}
+
+	spin_lock(&cpu_alloc_map_lock);
+
+	/*
+	 * We dropped the lock. Another processor may have already extended
+	 * the cpu area size as needed.
+	 */
+	if (blocks != active_blocks) {
+		if (new_map)
+			free_pages((unsigned long)new_map,
+					map_order);
+		err = 0;
+		goto out;
+	}
+
+	if (new_map) {
+		/*
+		 * Need to extend the bitmap
+		 */
+		if (cpu_alloc_map)
+			memcpy(new_map, cpu_alloc_map,
+				PAGE_SIZE << cpu_alloc_map_order);
+		cpu_alloc_map = new_map;
+		cpu_alloc_map_order = map_order;
+	}
+
+	active_blocks++;
+	units_total += UNITS_PER_BLOCK;
+	units_free += UNITS_PER_BLOCK;
+	err = 0;
+out:
+	return err;
+}
+
+#else
+
+/*
+ * Static fallback configuration. The cpu areas are of a fixed size and
+ * cannot be extended. Such configurations are mainly useful on
+ * machines that do not have MMU support.
+ */
+#define MAX_BLOCKS 1
+#define ALLOC_SIZE (1UL << (CONFIG_CPU_AREA_ORDER + PAGE_SHIFT))
+
+static u8 cpu_area[NR_CPUS * ALLOC_SIZE];
+static DECLARE_BITMAP(cpu_alloc_map, UNITS_PER_BLOCK);
+static int units_free = UNITS_PER_BLOCK;
+#define cpu_alloc_map_order CONFIG_CPU_AREA_ORDER
+#define units_total UNITS_PER_BLOCK
+
+static inline int expand_cpu_area(gfp_t flags)
+{
+	return -ENOSYS;
+}
+#endif
+
+static int first_free;		/* First known free unit */
+
+/*
+ * How many units are needed for an object of a given size
+ */
+static int size_to_units(unsigned long size)
+{
+	return DIV_ROUND_UP(size, UNIT_SIZE);
+}
+
+/*
+ * Mark an object as used in the cpu_alloc_map
+ *
+ * Must hold cpu_alloc_map_lock
+ */
+static void set_map(int start, int length)
+{
+	while (length-- > 0)
+		__set_bit(start++, cpu_alloc_map);
+}
+
+/*
+ * Mark an area as freed.
+ *
+ * Must hold cpu_alloc_map_lock
+ */
+static void clear_map(int start, int length)
+{
+	while (length-- > 0)
+		__clear_bit(start++, cpu_alloc_map);
+}
+
+/*
+ * Allocate an object of a certain size
+ *
+ * Returns a special pointer that can be used with CPU_PTR to find the
+ * address of the object for a certain cpu.
+ */
+void *cpu_alloc(unsigned long size, gfp_t gfpflags, unsigned long align)
+{
+	unsigned long start;
+	int units = size_to_units(size);
+	void *ptr;
+	int first;
+	unsigned long map_size;
+
+	BUG_ON(gfpflags & ~(GFP_RECLAIM_MASK | __GFP_ZERO));
+
+	spin_lock(&cpu_alloc_map_lock);
+
+restart:
+	map_size = PAGE_SIZE << cpu_alloc_map_order;
+	first = 1;
+	start = first_free;
+
+	for ( ; ; ) {
+
+		start = find_next_zero_bit(cpu_alloc_map, map_size, start);
+		if (first)
+			first_free = start;
+
+		if (start >= units_total) {
+			if (expand_cpu_area(gfpflags))
+				goto out_of_memory;
+			goto restart;
+		}
+
+		/*
+		 * Check alignment and that there is enough space after
+		 * the starting unit.
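+		 *
+		 * For example, with UNIT_SIZE == sizeof(int) (4 bytes on
+		 * most configurations) an object that requires 8 byte
+		 * alignment may only start at even unit numbers, and all
+		 * units up to start + units must still be unused.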
+		 */
+		if (start % (align / UNIT_SIZE) == 0 &&
+			find_next_bit(cpu_alloc_map, map_size, start + 1)
+							>= start + units)
+			break;
+		start++;
+		first = 0;
+	}
+
+	if (first)
+		first_free = start + units;
+
+	while (start + units > units_total) {
+		if (expand_cpu_area(gfpflags))
+			goto out_of_memory;
+	}
+
+	set_map(start, units);
+	units_free -= units;
+	__count_vm_events(CPU_BYTES, units * UNIT_SIZE);
+
+	spin_unlock(&cpu_alloc_map_lock);
+
+	ptr = cpu_area + start * UNIT_SIZE;
+
+	if (gfpflags & __GFP_ZERO) {
+		int cpu;
+
+		for_each_possible_cpu(cpu)
+			memset(CPU_PTR(ptr, cpu), 0, size);
+	}
+
+	return ptr;
+
+out_of_memory:
+	spin_unlock(&cpu_alloc_map_lock);
+	return NULL;
+}
+EXPORT_SYMBOL(cpu_alloc);
+
+/*
+ * Free an object. The pointer must be a cpu pointer allocated
+ * via cpu_alloc.
+ */
+void cpu_free(void *start, unsigned long size)
+{
+	int units = size_to_units(size);
+	int index;
+	u8 *p = start;
+
+	BUG_ON(p < cpu_area);
+	index = (p - cpu_area) / UNIT_SIZE;
+	BUG_ON(!test_bit(index, cpu_alloc_map) ||
+			index >= units_total);
+
+	spin_lock(&cpu_alloc_map_lock);
+
+	clear_map(index, units);
+	units_free += units;
+	__count_vm_events(CPU_BYTES, -units * UNIT_SIZE);
+	if (index < first_free)
+		first_free = index;
+
+	spin_unlock(&cpu_alloc_map_lock);
+}
+EXPORT_SYMBOL(cpu_free);
+

Index: linux-2.6/include/linux/vmstat.h
===================================================================
--- linux-2.6.orig/include/linux/vmstat.h	2007-11-06 06:05:02.000000000 +0000
+++ linux-2.6/include/linux/vmstat.h	2007-11-06 06:05:06.000000000 +0000
@@ -36,7 +36,7 @@
 		FOR_ALL_ZONES(PGSCAN_KSWAPD),
 		FOR_ALL_ZONES(PGSCAN_DIRECT),
 		PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL,
-		PAGEOUTRUN, ALLOCSTALL, PGROTATED,
+		PAGEOUTRUN, ALLOCSTALL, PGROTATED, CPU_BYTES,
 		NR_VM_EVENT_ITEMS
 };

Index: linux-2.6/mm/Makefile
===================================================================
--- linux-2.6.orig/mm/Makefile	2007-11-06 06:05:02.000000000 +0000
+++ linux-2.6/mm/Makefile	2007-11-06 06:05:06.000000000 +0000
@@ -11,7 +11,7 @@
 			   page_alloc.o page-writeback.o pdflush.o \
 			   readahead.o swap.o truncate.o vmscan.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
-			   page_isolation.o $(mmu-y)
+			   page_isolation.o cpu_alloc.o $(mmu-y)
 
 obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
@@ -30,4 +30,3 @@
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
-

Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c	2007-11-06 06:05:02.000000000 +0000
+++ linux-2.6/mm/vmstat.c	2007-11-06 06:05:06.000000000 +0000
@@ -642,6 +642,7 @@
 	"allocstall",
 
 	"pgrotated",
+	"cpu_bytes",
 #endif
 };

Index: linux-2.6/include/linux/cpu_alloc.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/include/linux/cpu_alloc.h	2007-11-06 06:05:06.000000000 +0000
@@ -0,0 +1,56 @@
+/*
+ * include/linux/cpu_alloc.h - cpu allocator definitions
+ *
+ * The cpu allocator allows allocating an array of objects on all processors.
+ * A single pointer can then be used to access the instance of the object
+ * on a particular processor.
+ *
+ * Cpu objects are typically small. The allocator packs them tightly to
+ * increase the chance that a per cpu object is already cached when it is
+ * accessed.
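+ *
+ * A minimal usage sketch (the struct and field names below are purely
+ * illustrative and not part of this interface):
+ *
+ *	struct my_counters *p = CPU_ALLOC(struct my_counters,
+ *					GFP_KERNEL | __GFP_ZERO);
+ *
+ *	if (!p)
+ *		return -ENOMEM;
+ *	preempt_disable();
+ *	THIS_CPU(p)->events++;
+ *	preempt_enable();
+ *	...
+ *	CPU_FREE(p);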
+ *
+ * Alignments may be specified, but the intent is to align the data
+ * properly to satisfy cpu alignment constraints, not to avoid cacheline
+ * contention. Any holes left by aligning objects are filled up with smaller
+ * objects that are allocated later.
+ *
+ * Cpu data can be allocated using CPU_ALLOC. The resulting pointer points
+ * to the instance of the variable on cpu 0. It is generally an error to
+ * use the pointer directly unless we are running on cpu 0, so direct use
+ * is valid during early boot, for example, when only cpu 0 is running.
+ *
+ * The GFP flags have their usual function: __GFP_ZERO zeroes the object
+ * and other flags may be used to control reclaim behavior if the cpu
+ * areas have to be extended. However, zones cannot be selected nor
+ * can locality constraint flags be used.
+ *
+ * CPU_PTR() may be used to calculate the pointer for a specific processor.
+ * CPU_PTR is highly scalable since it simply adds the shifted value of
+ * smp_processor_id() to the base pointer.
+ *
+ * Note: Synchronization is up to the caller. If preemption is disabled
+ * then it is generally safe to access cpu variables (unless they are
+ * also handled from an interrupt context).
+ */
+#ifndef _LINUX_CPU_ALLOC_H_
+#define _LINUX_CPU_ALLOC_H_
+
+#define CPU_OFFSET(__cpu) \
+	((unsigned long)(__cpu) << (CONFIG_CPU_AREA_ORDER + PAGE_SHIFT))
+
+#define CPU_PTR(__p, __cpu) ((__typeof__(__p))((void *)(__p) + \
+					CPU_OFFSET(__cpu)))
+
+#define CPU_ALLOC(type, flags)	cpu_alloc(sizeof(type), flags, \
+					__alignof__(type))
+#define CPU_FREE(pointer)	cpu_free(pointer, sizeof(*(pointer)))
+
+#define THIS_CPU(__p)	CPU_PTR(__p, smp_processor_id())
+#define __THIS_CPU(__p)	CPU_PTR(__p, raw_smp_processor_id())
+
+/*
+ * Raw calls
+ */
+void *cpu_alloc(unsigned long size, gfp_t gfp, unsigned long align);
+void cpu_free(void *cpu_pointer, unsigned long size);
+
+#endif /* _LINUX_CPU_ALLOC_H_ */

Index: linux-2.6/mm/Kconfig
===================================================================
--- linux-2.6.orig/mm/Kconfig	2007-11-06 06:05:02.000000000 +0000
+++ linux-2.6/mm/Kconfig	2007-11-06 06:06:01.000000000 +0000
@@ -194,3 +194,15 @@
 config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
+
+config CPU_AREA_ORDER
+	int "Maximum order of CPU area"
+	default "16" if CPU_AREA_VIRTUAL
+	default "4" if !CPU_AREA_VIRTUAL
+	help
+	  Sets the maximum amount of memory that can be allocated via
+	  cpu_alloc. The size is specified as a page order. The size set
+	  (times the maximum number of processors) determines the amount
+	  of virtual memory that is set aside for the per cpu areas.
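+
+	  For example, with the default order of 16 and 4 KB pages, up to
+	  2^16 pages (256 MB) of address space are reserved per processor;
+	  with the static fallback default of 4, this is 16 pages (64 KB)
+	  per processor.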
+
+
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h	2007-11-06 06:05:02.000000000 +0000
+++ linux-2.6/include/linux/mm.h	2007-11-06 06:05:06.000000000 +0000
@@ -1141,5 +1141,18 @@
 			unsigned long pages, int node);
 int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
 
+pgd_t *cpu_area_pgd_populate(unsigned long addr, gfp_t flags, int node);
+pud_t *cpu_area_pud_populate(pgd_t *pgd, unsigned long addr,
+			gfp_t flags, int node);
+pmd_t *cpu_area_pmd_populate(pud_t *pud, unsigned long addr,
+			gfp_t flags, int node);
+pte_t *cpu_area_pte_populate(pmd_t *pmd, unsigned long addr,
+			gfp_t flags, int node);
+void *cpu_area_alloc_block(unsigned long size, gfp_t flags, int node);
+int cpu_area_populate_basepages(void *start, unsigned long size,
+			gfp_t flags, int node);
+int cpu_area_populate(void *start, unsigned long size,
+			gfp_t flags, int node);
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */