This allows user space to implement flexible policies. For example, it may either do an emergency sync/shutdown or schedule a reboot at some convenient time, depending on the severity of the corruption. Signed-off-by: Wu Fengguang --- Documentation/vm/memory-failure | 68 ++++++++++++++++++ mm/memory-failure.c | 110 +++++++++++++++++++++++++++++- 2 files changed, 175 insertions(+), 3 deletions(-) --- /dev/null +++ sound-2.6/Documentation/vm/memory-failure @@ -0,0 +1,68 @@ +Memory failure and hardware poison events + +Memory may have soft errors, and the more memory you have the more errors. +Normally hardware hides that from you by correcting it, but in some cases you +can get multi-bit errors which lead to uncorrected errors the hardware cannot +hide. + +This does not necessarily mean that the hardware is broken; for example it can +be caused by cosmic particles hitting an unlucky transistor. So it can really +happen in normal operation. + +Some hardware (e.g. Nehalem-EX) supports background memory scrubbing in order to +report memory corruption before the corrupted data is consumed. The kernel will then try +to isolate the corrupted memory page, restore data, and finally send a uevent +to user space.
+ +A memory poison uevent looks like this: + + # udevadm monitor --environment --kernel + KERNEL[1245030313.702625] change /kernel/mm/hwpoison/hwpoison (hwpoison) + UDEV_LOG=3 + ACTION=change + DEVPATH=/kernel/mm/hwpoison/hwpoison + SUBSYSTEM=hwpoison + EVENT=poison + PHYS_ADDR=0x19e1c000 + PAGE_FLAGS=0x80008083c + PAGE_COUNT=3 + PAGE_MAPCOUNT=1 + PAGE_DEV=8:2 + PAGE_INODE=56169 + PAGE_INDEX=9 + PAGE_TYPE=file_data + PAGE_ISOLATED=1 + DATA_RECOVERABLE=0 + SEQNUM=2109 + +where + + PHYS_ADDR the physical page address + PAGE_FLAGS the kpageflags bits defined at Documentation/vm/pagemap.txt + PAGE_COUNT the original page reference count + PAGE_MAPCOUNT the original page map count + + PAGE_TYPE where the error lands; can be one of + "kernel" - a kernel page that may contain some critical data structure + "fs_metadata" - a filesystem metadata page + "file_data" - a file data page + "anon_data" - a page belonging to some process(es) + "swap_cache" - it's in the swap cache; the kernel cannot tell if it was an + anon_data page or a tmpfs file_data page + "free" - a free page; not used by anyone + +For "file_data" pages, the following three vars are available: + + PAGE_DEV the file's MAJOR:MINOR device numbers in decimal + PAGE_INODE the file's inode number in decimal + PAGE_INDEX the file offset in units of the page size + + PAGE_ISOLATED if 1, we are sure that the page won't be consumed in the future. + if 0, the error page is still referenced by someone, and may be + consumed at any time, which will be detected/stopped by hardware, + and trigger an instant machine reboot. + + DATA_RECOVERABLE if 1, no data are lost. For example, it's a free page, or a + clean page whose data can be reloaded from disk. In these + cases, user space will not see the error at all.
+ --- sound-2.6.orig/mm/memory-failure.c +++ sound-2.6/mm/memory-failure.c @@ -330,7 +330,11 @@ static const char *hwpoison_page_type_na [ PAGE_IS_FREE ] = "free", }; +static struct kset *hwpoison_kset; +static struct kobject hwpoison_kobj; + struct hwpoison_control { + struct kobj_uevent_env *env; unsigned long pfn; struct page *p; /* corrupted page */ struct page *page; /* compound page head */ @@ -340,6 +344,51 @@ struct hwpoison_control { unsigned page_isolated:1; }; +static void hwpoison_uevent_page(struct hwpoison_control *hpc) +{ + struct page *p = hpc->page; + + if (hpc->env == NULL) + return; + + add_uevent_var(hpc->env, "EVENT=poison"); + add_uevent_var(hpc->env, "PHYS_ADDR=%#lx", hpc->pfn << PAGE_SHIFT); + add_uevent_var(hpc->env, "PAGE_FLAGS=%#Lx", page_uflags(p)); + add_uevent_var(hpc->env, "PAGE_COUNT=%d", page_count(p)); + add_uevent_var(hpc->env, "PAGE_MAPCOUNT=%d", page_mapcount(p)); +} + +static void hwpoison_uevent_file(struct hwpoison_control *hpc) +{ + struct address_space *mapping = page_mapping(hpc->page); + + if (hpc->env == NULL) + return; + + if (!mapping || !mapping->host) + return; + + add_uevent_var(hpc->env, "PAGE_DEV=%d:%d", + MAJOR(mapping->host->i_sb->s_dev), + MINOR(mapping->host->i_sb->s_dev)); + add_uevent_var(hpc->env, "PAGE_INODE=%lu", mapping->host->i_ino); + add_uevent_var(hpc->env, "PAGE_INDEX=%lu", hpc->page->index); +} + +static void hwpoison_uevent_send(struct hwpoison_control *hpc) +{ + if (hpc->env == NULL) + return; + + add_uevent_var(hpc->env, "PAGE_TYPE=%s", + hwpoison_page_type_name[hpc->page_type]); + add_uevent_var(hpc->env, "PAGE_ISOLATED=%d", + hpc->page_isolated); + add_uevent_var(hpc->env, "DATA_RECOVERABLE=%d", + hpc->data_recoverable); + kobject_uevent_env(&hwpoison_kobj, KOBJ_CHANGE, hpc->env->envp); +} + /* * Error hit kernel page. * Do nothing, try to be lucky and not touch this instead. 
For a few cases we @@ -769,10 +818,19 @@ void memory_failure(unsigned long pfn, i return; } + hpc.env = kzalloc(sizeof(struct kobj_uevent_env), GFP_NOIO); + if (!hpc.env) { + printk(KERN_ERR + "MCE %#lx: cannot allocate memory for uevent\n", + pfn); + } + hpc.pfn = pfn; hpc.p = p; hpc.page = p = compound_head(p); + hwpoison_uevent_page(&hpc); + hpc.page_type = PAGE_IS_KERNEL; hpc.data_recoverable = 0; hpc.page_isolated = 0; @@ -796,7 +854,7 @@ void memory_failure(unsigned long pfn, i action_result(&hpc, "free buddy", DELAYED); } else action_result(&hpc, "high order kernel", IGNORED); - return; + goto out; } /* @@ -825,16 +883,62 @@ void memory_failure(unsigned long pfn, i if (!PageSwapCache(p) && p->mapping == NULL) { action_result(&hpc, "already truncated LRU", IGNORED); hpc.page_type = PAGE_IS_FREE; - goto out; + goto out_unlock; } } + hwpoison_uevent_file(&hpc); + for (ps = error_states;; ps++) { if ((p->flags & ps->mask) == ps->res) { page_action(ps, &hpc); break; } } -out: +out_unlock: unlock_page(p); +out: + hwpoison_uevent_send(&hpc); +} + +static void hwpoison_release(struct kobject *kobj) +{ +} + +static struct kobj_type hwpoison_ktype = { + .release = hwpoison_release, +}; + +static int hwpoison_kobj_init(void) +{ + int err; + + hwpoison_kset = kset_create_and_add("hwpoison", NULL, mm_kobj); + if (!hwpoison_kset) + return -ENOMEM; + + hwpoison_kobj.kset = hwpoison_kset; + + err = kobject_init_and_add(&hwpoison_kobj, &hwpoison_ktype, NULL, + "hwpoison"); + if (err) + return -ENOMEM; + + kobject_uevent(&hwpoison_kobj, KOBJ_ADD); + + return 0; } + + +static int __init hwpoison_init(void) +{ + return hwpoison_kobj_init(); +} + +static void __exit hwpoison_exit(void) +{ + kset_unregister(hwpoison_kset); +} + +module_init(hwpoison_init); +module_exit(hwpoison_exit); -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html 
Please read the FAQ at http://www.tux.org/lkml/