It needs more thought, and is not a must-have for .31.

CC: Nick Piggin
CC: Hugh Dickins
Signed-off-by: Wu Fengguang
---
 Documentation/sysctl/vm.txt |   28 ---
 include/linux/mm.h          |    1 
 include/linux/rmap.h        |    6 
 kernel/sysctl.c             |   13 -
 mm/filemap.c                |    4 
 mm/memory-failure.c         |  272 ----------------------------------
 mm/rmap.c                   |    8 -
 7 files changed, 3 insertions(+), 329 deletions(-)

--- sound-2.6.orig/mm/memory-failure.c
+++ sound-2.6/mm/memory-failure.c
@@ -48,251 +48,9 @@
 #include 
 #include "internal.h"
 
-int sysctl_memory_failure_early_kill __read_mostly = 1;
-
 atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
 
 /*
- * Send all the processes who have the page mapped an ``action optional''
- * signal.
- */
-static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
-			unsigned long pfn)
-{
-	struct siginfo si;
-	int ret;
-
-	printk(KERN_ERR
-		"MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
-		pfn, t->comm, t->pid);
-	si.si_signo = SIGBUS;
-	si.si_errno = 0;
-	si.si_code = BUS_MCEERR_AO;
-	si.si_addr = (void *)addr;
-#ifdef __ARCH_SI_TRAPNO
-	si.si_trapno = trapno;
-#endif
-	si.si_addr_lsb = PAGE_SHIFT;
-	/*
-	 * Don't use force here, it's convenient if the signal
-	 * can be temporarily blocked.
-	 * This could cause a loop when the user sets SIGBUS
-	 * to SIG_IGN, but hopefully noone will do that?
-	 */
-	ret = send_sig_info(SIGBUS, &si, t);	/* synchronous? */
-	if (ret < 0)
-		printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
-			t->comm, t->pid, ret);
-	return ret;
-}
-
-/*
- * Kill all processes that have a poisoned page mapped and then isolate
- * the page.
- *
- * General strategy:
- * Find all processes having the page mapped and kill them.
- * But we keep a page reference around so that the page is not
- * actually freed yet.
- * Then stash the page away
- *
- * There's no convenient way to get back to mapped processes
- * from the VMAs. So do a brute-force search over all
- * running processes.
- *
- * Remember that machine checks are not common (or rather
- * if they are common you have other problems), so this shouldn't
- * be a performance issue.
- *
- * Also there are some races possible while we get from the
- * error detection to actually handle it.
- */
-
-struct to_kill {
-	struct list_head nd;
-	struct task_struct *tsk;
-	unsigned long addr;
-	unsigned addr_valid:1;
-};
-
-/*
- * Failure handling: if we can't find or can't kill a process there's
- * not much we can do. We just print a message and ignore otherwise.
- */
-
-/*
- * Schedule a process for later kill.
- * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
- * TBD would GFP_NOIO be enough?
- */
-static void add_to_kill(struct task_struct *tsk, struct page *p,
-			struct vm_area_struct *vma,
-			struct list_head *to_kill,
-			struct to_kill **tkc)
-{
-	struct to_kill *tk;
-
-	if (*tkc) {
-		tk = *tkc;
-		*tkc = NULL;
-	} else {
-		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
-		if (!tk) {
-			printk(KERN_ERR
-		"MCE: Out of memory while machine check handling\n");
-			return;
-		}
-	}
-	tk->addr = page_address_in_vma(p, vma);
-	tk->addr_valid = 1;
-
-	/*
-	 * In theory we don't have to kill when the page was
-	 * munmaped. But it could be also a mremap. Since that's
-	 * likely very rare kill anyways just out of paranoia, but use
-	 * a SIGKILL because the error is not contained anymore.
-	 */
-	if (tk->addr == -EFAULT) {
-		pr_debug("MCE: Unable to find user space address %lx in %s\n",
-			page_to_pfn(p), tsk->comm);
-		tk->addr_valid = 0;
-	}
-	get_task_struct(tsk);
-	tk->tsk = tsk;
-	list_add_tail(&tk->nd, to_kill);
-}
-
-/*
- * Kill the processes that have been collected earlier.
- *
- * Only do anything when DOIT is set, otherwise just free the list
- * (this is used for clean pages which do not need killing)
- * Also when FAIL is set do a force kill because something went
- * wrong earlier.
- */
-static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
-			  int fail, unsigned long pfn)
-{
-	struct to_kill *tk, *next;
-
-	list_for_each_entry_safe (tk, next, to_kill, nd) {
-		if (doit) {
-			/*
-			 * In case something went wrong with munmaping
-			 * make sure the process doesn't catch the
-			 * signal and then access the memory. Just kill it.
-			 * the signal handlers
-			 */
-			if (fail || tk->addr_valid == 0) {
-				printk(KERN_ERR
-		"MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
-					pfn, tk->tsk->comm, tk->tsk->pid);
-				force_sig(SIGKILL, tk->tsk);
-			}
-
-			/*
-			 * In theory the process could have mapped
-			 * something else on the address in-between. We could
-			 * check for that, but we need to tell the
-			 * process anyways.
-			 */
-			else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
-					      pfn) < 0)
-				printk(KERN_ERR
-		"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
-					pfn, tk->tsk->comm, tk->tsk->pid);
-		}
-		put_task_struct(tk->tsk);
-		kfree(tk);
-	}
-}
-
-/*
- * Collect processes when the error hit an anonymous page.
- */
-static void collect_procs_anon(struct page *page, struct list_head *to_kill,
-			       struct to_kill **tkc)
-{
-	struct vm_area_struct *vma;
-	struct task_struct *tsk;
-	struct anon_vma *av;
-
-	read_lock(&tasklist_lock);
-
-	av = page_lock_anon_vma(page);
-	if (av == NULL)	/* Not actually mapped anymore */
-		goto out;
-
-	for_each_process (tsk) {
-		if (!tsk->mm)
-			continue;
-		list_for_each_entry (vma, &av->head, anon_vma_node) {
-			if (vma->vm_mm == tsk->mm)
-				add_to_kill(tsk, page, vma, to_kill, tkc);
-		}
-	}
-	page_unlock_anon_vma(av);
-out:
-	read_unlock(&tasklist_lock);
-}
-
-/*
- * Collect processes when the error hit a file mapped page.
- */
-static void collect_procs_file(struct page *page, struct list_head *to_kill,
-			       struct to_kill **tkc)
-{
-	struct vm_area_struct *vma;
-	struct task_struct *tsk;
-	struct prio_tree_iter iter;
-	struct address_space *mapping = page_mapping(page);
-
-	/*
-	 * A note on the locking order between the two locks.
-	 * We don't rely on this particular order.
-	 * If you have some other code that needs a different order
-	 * feel free to switch them around. Or add a reverse link
-	 * from mm_struct to task_struct, then this could be all
-	 * done without taking tasklist_lock and looping over all tasks.
-	 */
-
-	read_lock(&tasklist_lock);
-	spin_lock(&mapping->i_mmap_lock);
-	for_each_process(tsk) {
-		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-
-		if (!tsk->mm)
-			continue;
-
-		vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
-				      pgoff)
-			if (vma->vm_mm == tsk->mm)
-				add_to_kill(tsk, page, vma, to_kill, tkc);
-	}
-	spin_unlock(&mapping->i_mmap_lock);
-	read_unlock(&tasklist_lock);
-}
-
-/*
- * Collect the processes who have the corrupted page mapped to kill.
- * This is done in two steps for locking reasons.
- * First preallocate one tokill structure outside the spin locks,
- * so that we can kill at least one process reasonably reliable.
- */
-static void collect_procs(struct page *page, struct list_head *tokill)
-{
-	struct to_kill *tk;
-
-	tk = kmalloc(sizeof(struct to_kill), GFP_KERNEL);
-	/* memory allocation failure is implicitly handled */
-	if (PageAnon(page))
-		collect_procs_anon(page, tokill, &tk);
-	else
-		collect_procs_file(page, tokill, &tk);
-	kfree(tk);
-}
-
-/*
  * Error handlers for various types of pages.
  */
 
@@ -599,7 +357,6 @@ static void hwpoison_user_mappings(struc
 				  int trapno)
 {
 	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
-	int kill = sysctl_memory_failure_early_kill;
 	struct address_space *mapping;
 	LIST_HEAD(tokill);
 	int ret;
@@ -633,7 +390,6 @@ static void hwpoison_user_mappings(struc
 		if (page_mkclean(p))
 			SetPageDirty(p);
 		else {
-			kill = 0;
 			ttu |= TTU_IGNORE_HWPOISON;
 			printk(KERN_INFO
 	"MCE %#lx: corrupted page was clean: dropped without side effects\n",
@@ -642,22 +398,6 @@ static void hwpoison_user_mappings(struc
 	}
 
 	/*
-	 * First collect all the processes that have the page
-	 * mapped. This has to be done before try_to_unmap,
-	 * because ttu takes the rmap data structures down.
-	 *
-	 * This also has the side effect to propagate the dirty
-	 * bit from PTEs into the struct page. This is needed
-	 * to actually decide if something needs to be killed
-	 * or errored, or if it's ok to just drop the page.
-	 *
-	 * Error handling: We ignore errors here because
-	 * there's nothing that can be done.
-	 */
-	if (kill)
-		collect_procs(p, &tokill);
-
-	/*
 	 * try_to_unmap can fail temporarily due to races.
 	 * Try a few times (RED-PEN better strategy?)
 	 */
@@ -671,18 +411,6 @@ static void hwpoison_user_mappings(struc
 	if (ret != SWAP_SUCCESS)
 		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
 				pfn, page_mapcount(p));
-
-	/*
-	 * Now that the dirty bit has been propagated to the
-	 * struct page and all unmaps done we can decide if
-	 * killing is needed or not. Only kill when the page
-	 * was dirty, otherwise the tokill list is merely
-	 * freed. When there was a problem unmapping earlier
-	 * use a more force-full uncatchable kill to prevent
-	 * any accesses to the poisoned memory.
-	 */
-	kill_procs_ao(&tokill, !!PageDirty(p), trapno,
-		      ret != SWAP_SUCCESS, pfn);
 }
 
 /**
--- sound-2.6.orig/Documentation/sysctl/vm.txt
+++ sound-2.6/Documentation/sysctl/vm.txt
@@ -32,7 +32,6 @@ Currently, these files are in /proc/sys/
 - legacy_va_layout
 - lowmem_reserve_ratio
 - max_map_count
-- memory_failure_early_kill
 - min_free_kbytes
 - min_slab_ratio
 - min_unmapped_ratio
@@ -54,6 +53,7 @@ Currently, these files are in /proc/sys/
 - vfs_cache_pressure
 - zone_reclaim_mode
 
+
 ==============================================================
 
 block_dump
@@ -275,32 +275,6 @@ e.g., up to one or two maps per allocati
 
 The default value is 65536.
 
-=============================================================
-
-memory_failure_early_kill:
-
-Control how to kill processes when uncorrected memory error (typically
-a 2bit error in a memory module) is detected in the background by hardware
-that cannot be handled by the kernel. In some cases (like the page
-still having a valid copy on disk) the kernel will handle the failure
-transparently without affecting any applications. But if there is
-no other uptodate copy of the data it will kill to prevent any data
-corruptions from propagating.
-
-1: Kill all processes that have the corrupted and not reloadable page mapped
-as soon as the corruption is detected. Note this is not supported
-for a few types of pages, like kernel internally allocated data or
-the swap cache, but works for the majority of user pages.
-
-0: Only unmap the corrupted page from all processes and only kill a process
-who tries to access it.
-
-The kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can
-handle this if they want to.
-
-This is only active on architectures/platforms with advanced machine
-check handling and depends on the hardware capabilities.
-
 ==============================================================
 
 min_free_kbytes:
--- sound-2.6.orig/include/linux/mm.h
+++ sound-2.6/include/linux/mm.h
@@ -1331,7 +1331,6 @@ extern int account_locked_memory(struct
 extern void refund_locked_memory(struct mm_struct *mm, size_t size);
 
 extern void memory_failure(unsigned long pfn, int trapno);
-extern int sysctl_memory_failure_early_kill;
 extern atomic_long_t mce_bad_pages;
 
 #endif /* __KERNEL__ */
--- sound-2.6.orig/kernel/sysctl.c
+++ sound-2.6/kernel/sysctl.c
@@ -1319,19 +1319,6 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &scan_unevictable_handler,
 	},
-#ifdef CONFIG_MEMORY_FAILURE
-	{
-		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "memory_failure_early_kill",
-		.data		= &sysctl_memory_failure_early_kill,
-		.maxlen		= sizeof(sysctl_memory_failure_early_kill),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
-		.extra2		= &one,
-	},
-#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
--- sound-2.6.orig/mm/filemap.c
+++ sound-2.6/mm/filemap.c
@@ -105,10 +105,6 @@
  *
  *  ->task->proc_lock
  *    ->dcache_lock		(proc_pid_lookup)
- *
- * (code doesn't rely on that order, so you could switch it around)
- *  ->tasklist_lock		(memory_failure, collect_procs_ao)
- *    ->i_mmap_lock
  */
 
 /*
--- sound-2.6.orig/mm/rmap.c
+++ sound-2.6/mm/rmap.c
@@ -36,10 +36,6 @@
  *               mapping->tree_lock (widely used, in set_page_dirty,
  *                         in arch-dependent flush_dcache_mmap_lock,
  *                         within inode_lock in __sync_single_inode)
- *
- * (code doesn't rely on that order so it could be switched around)
- *   ->tasklist_lock
- *     anon_vma->lock	(memory_failure, collect_procs_anon)
  */
 
 #include 
@@ -195,7 +191,7 @@ void __init anon_vma_init(void)
  * Getting a lock on a stable anon_vma from a page off the LRU is
  * tricky: page_lock_anon_vma rely on RCU to guard against the races.
  */
-struct anon_vma *page_lock_anon_vma(struct page *page)
+static struct anon_vma *page_lock_anon_vma(struct page *page)
 {
 	struct anon_vma *anon_vma;
 	unsigned long anon_mapping;
@@ -215,7 +211,7 @@ out:
 	return NULL;
 }
 
-void page_unlock_anon_vma(struct anon_vma *anon_vma)
+static void page_unlock_anon_vma(struct anon_vma *anon_vma)
 {
 	spin_unlock(&anon_vma->lock);
 	rcu_read_unlock();
--- sound-2.6.orig/include/linux/rmap.h
+++ sound-2.6/include/linux/rmap.h
@@ -129,12 +129,6 @@ int try_to_munlock(struct page *);
 int page_wrprotect(struct page *page, int *odirect_sync, int count_offset);
 #endif
 
-/*
- * Called by memory-failure.c to kill processes.
- */
-struct anon_vma *page_lock_anon_vma(struct page *page);
-void page_unlock_anon_vma(struct anon_vma *anon_vma);
-
 #else	/* !CONFIG_MMU */
 
 #define anon_vma_init()		do {} while (0)
-- 