Notice swap write errors during page reclaim, and deallocate the swap entry
which is backing the swapcache.  This allows the page error to be cleared and
the page to be allocated to a new entry, rather than the page becoming pinned
forever.

Based on code from Richard Purdie.

Index: linux-2.6/mm/shmem.c
===================================================================
--- linux-2.6.orig/mm/shmem.c	2006-11-01 14:58:28.000000000 +1100
+++ linux-2.6/mm/shmem.c	2006-11-01 16:10:21.000000000 +1100
@@ -729,6 +729,8 @@ static int shmem_unuse_inode(struct shme
 	struct page *subdir;
 	swp_entry_t *ptr;
 	int offset;
+	int moved;
+	int error;
 
 	idx = 0;
 	ptr = info->i_direct;
@@ -786,17 +788,20 @@ lost2:
 found:
 	idx += offset;
 	inode = &info->vfs_inode;
-	if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) {
+	error = PageError(page);
+	moved = (move_from_swap_cache(page, idx, inode->i_mapping) == 0);
+	if (moved) {
 		info->flags |= SHMEM_PAGEIN;
 		shmem_swp_set(info, ptr + offset, 0);
 	}
 	shmem_swp_unmap(ptr);
 	spin_unlock(&info->lock);
-	/*
-	 * Decrement swap count even when the entry is left behind:
-	 * try_to_unuse will skip over mms, then reincrement count.
-	 */
-	swap_free(entry);
+	if (moved) {
+		if (!error)
+			swap_free(entry);
+		else
+			swap_free_markbad(entry);
+	}
 	return 1;
 }
 
Index: linux-2.6/mm/swapfile.c
===================================================================
--- linux-2.6.orig/mm/swapfile.c	2006-11-01 14:58:28.000000000 +1100
+++ linux-2.6/mm/swapfile.c	2006-11-01 16:14:26.000000000 +1100
@@ -304,6 +304,23 @@ void swap_free(swp_entry_t entry)
 	}
 }
 
+void swap_free_markbad(swp_entry_t entry)
+{
+	struct swap_info_struct * p;
+
+	p = swap_info_get(entry);
+	if (p) {
+		unsigned long offset = swp_offset(entry);
+		if (swap_entry_free(p, offset) == 0) {
+			p->swap_map[offset] = SWAP_MAP_BAD;
+			p->pages--;
+			nr_swap_pages--;
+			total_swap_pages--;
+		}
+		spin_unlock(&swap_lock);
+	}
+}
+
 /*
  * How many references to page are currently swapped out?
  */
@@ -618,6 +635,207 @@ static int unuse_mm(struct mm_struct *mm
 	return 0;
 }
 
+static int try_to_unuse_entry(swp_entry_t entry, unsigned short *swap_map,
+			struct page *page, struct mm_struct **start_mm_p)
+{
+	struct mm_struct *start_mm;
+	unsigned short swcount;
+	int retval = 0;
+	int shmem;
+
+	if (start_mm_p)
+		start_mm = *start_mm_p;
+	else {
+		start_mm = &init_mm;
+		atomic_inc(&init_mm.mm_users);
+	}
+
+	/*
+	 * Don't hold on to start_mm if it looks like exiting.
+	 */
+	if (atomic_read(&start_mm->mm_users) == 1) {
+		mmput(start_mm);
+		start_mm = &init_mm;
+		atomic_inc(&init_mm.mm_users);
+	}
+
+	/*
+	 * Wait for and lock page.  When do_swap_page races with try_to_unuse,
+	 * do_swap_page can handle the fault much faster than try_to_unuse can
+	 * locate the entry.  This apparently redundant "wait_on_page_locked"
+	 * lets try_to_unuse defer to do_swap_page in such a case - in some
+	 * tests, do_swap_page and try_to_unuse repeatedly compete.
+	 */
+retry:
+	wait_on_page_locked(page);
+	wait_on_page_writeback(page);
+	lock_page(page);
+	wait_on_page_writeback(page);
+
+	/*
+	 * Remove all references to entry.
+	 * Whenever we reach init_mm, there's no address space to search, but
+	 * use it as a reminder to search shmem.
+	 */
+	shmem = 0;
+	swcount = *swap_map;
+	if (swcount > 1) {
+		if (start_mm == &init_mm)
+			shmem = shmem_unuse(entry, page);
+		else
+			retval = unuse_mm(start_mm, entry, page);
+	}
+	if (*swap_map > 1) {
+		int set_start_mm = (*swap_map >= swcount);
+		struct list_head *p = &start_mm->mmlist;
+		struct mm_struct *new_start_mm = start_mm;
+		struct mm_struct *prev_mm = start_mm;
+		struct mm_struct *mm;
+
+		atomic_inc(&new_start_mm->mm_users);
+		atomic_inc(&prev_mm->mm_users);
+		spin_lock(&mmlist_lock);
+		while (*swap_map > 1 && !retval &&
+				(p = p->next) != &start_mm->mmlist) {
+			mm = list_entry(p, struct mm_struct, mmlist);
+			if (!atomic_inc_not_zero(&mm->mm_users))
+				continue;
+			spin_unlock(&mmlist_lock);
+			mmput(prev_mm);
+			prev_mm = mm;
+
+			cond_resched();
+
+			swcount = *swap_map;
+			if (swcount <= 1)
+				;
+			else if (mm == &init_mm) {
+				set_start_mm = 1;
+				shmem = shmem_unuse(entry, page);
+			} else
+				retval = unuse_mm(mm, entry, page);
+			if (set_start_mm && *swap_map < swcount) {
+				mmput(new_start_mm);
+				atomic_inc(&mm->mm_users);
+				new_start_mm = mm;
+				set_start_mm = 0;
+			}
+			spin_lock(&mmlist_lock);
+		}
+		spin_unlock(&mmlist_lock);
+		mmput(prev_mm);
+		mmput(start_mm);
+		start_mm = new_start_mm;
+	}
+	if (retval) {
+		unlock_page(page);
+		page_cache_release(page);
+		goto out;
+	}
+
+	/*
+	 * How could swap count reach 0x7fff when the maximum pid is 0x7fff,
+	 * and there's no way to repeat a swap page within an mm (except in
+	 * shmem, where it's the shared object which takes the reference
+	 * count)?  We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
+	 *
+	 * If that's wrong, then we should worry more about exit_mmap() and
+	 * do_munmap() cases described above: we might be resetting
+	 * SWAP_MAP_MAX too early here.  We know "Undead"s can happen, they're
+	 * okay, so don't report them; but do report if we reset SWAP_MAP_MAX.
+	 */
+	if (*swap_map == SWAP_MAP_MAX) {
+		spin_lock(&swap_lock);
+		*swap_map = 1;
+		spin_unlock(&swap_lock);
+		if (printk_ratelimit())
+			printk(KERN_WARNING
+			  "try_to_unuse_entry: cleared swap entry overflow\n");
+	}
+
+	/*
+	 * If a reference remains (rare), we would like to leave the page in
+	 * the swap cache; but try_to_unmap could then re-duplicate the entry
+	 * once we drop page lock, so we might loop indefinitely; also, that
+	 * page could not be swapped out to other storage meanwhile.  So:
+	 * delete from cache even if there's another reference, after ensuring
+	 * that the data has been saved to disk - since if the reference
+	 * remains (rarer), it will be read from disk into another page.
+	 * Splitting into two pages would be incorrect if swap supported
+	 * "shared private" pages, but they are handled by tmpfs files.
+	 *
+	 * Note shmem_unuse already deleted a swappage from the swap cache,
+	 * unless the move to filepage failed: in which case it left swappage
+	 * in cache, lowered its swap count to pass quickly through the loops
+	 * above, and now we must reincrement count to try again later.
+	 */
+	if (PageSwapCache(page)) {
+		if ((*swap_map > 1) && PageDirty(page)) {
+			struct writeback_control wbc = {
+				.sync_mode = WB_SYNC_NONE,
+			};
+
+			swap_writepage(page, &wbc);
+			goto retry;
+		}
+		if (!shmem) {
+			int error = PageError(page);
+
+			write_lock_irq(&swapper_space.tree_lock);
+			__delete_from_swap_cache(page);
+			write_unlock_irq(&swapper_space.tree_lock);
+			page_cache_release(page);	/* the swapcache ref */
+
+			if (!error)
+				swap_free(entry);
+			else
+				swap_free_markbad(entry);
+		}
+	}
+
+	/*
+	 * So we could skip searching mms once swap count went to 1, we did
+	 * not mark any present ptes as dirty: must mark page dirty so
+	 * shrink_list will preserve it.
+	 */
+	SetPageDirty(page);
+	unlock_page(page);
+	page_cache_release(page);
+
+	if (start_mm_p)
+		*start_mm_p = start_mm;
+	else
+		mmput(start_mm);
+
+out:
+	return retval;
+}
+
+void try_to_unuse_page_entry(struct page *page)
+{
+	struct swap_info_struct *si;
+	unsigned short *swap_map;
+	swp_entry_t entry;
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(!PageSwapCache(page));
+	BUG_ON(PageWriteback(page));
+	BUG_ON(PagePrivate(page));
+
+	entry.val = page_private(page);
+	si = swap_info_get(entry);
+	if (!si) {
+		WARN_ON(1);
+		return;
+	}
+	swap_map = &si->swap_map[swp_offset(entry)];
+	spin_unlock(&swap_lock);
+
+	BUG_ON(*swap_map == SWAP_MAP_BAD);
+
+	try_to_unuse_entry(entry, swap_map, page, NULL);
+}
+
 /*
  * Scan swap_map from current position to next entry still in use.
  * Recycle to start on reaching the end, returning 0 when empty.
@@ -666,13 +884,10 @@ static int try_to_unuse(unsigned int typ
 	struct swap_info_struct * si = &swap_info[type];
 	struct mm_struct *start_mm;
 	unsigned short *swap_map;
-	unsigned short swcount;
 	struct page *page;
 	swp_entry_t entry;
 	unsigned int i = 0;
 	int retval = 0;
-	int reset_overflow = 0;
-	int shmem;
 
 	/*
 	 * When searching mms for an entry, a good strategy is to
@@ -724,152 +939,10 @@ static int try_to_unuse(unsigned int typ
 			break;
 		}
 
-		/*
-		 * Don't hold on to start_mm if it looks like exiting.
-		 */
-		if (atomic_read(&start_mm->mm_users) == 1) {
-			mmput(start_mm);
-			start_mm = &init_mm;
-			atomic_inc(&init_mm.mm_users);
-		}
-
-		/*
-		 * Wait for and lock page.  When do_swap_page races with
-		 * try_to_unuse, do_swap_page can handle the fault much
-		 * faster than try_to_unuse can locate the entry.  This
-		 * apparently redundant "wait_on_page_locked" lets try_to_unuse
-		 * defer to do_swap_page in such a case - in some tests,
-		 * do_swap_page and try_to_unuse repeatedly compete.
-		 */
-		wait_on_page_locked(page);
-		wait_on_page_writeback(page);
-		lock_page(page);
-		wait_on_page_writeback(page);
-
-		/*
-		 * Remove all references to entry.
-		 * Whenever we reach init_mm, there's no address space
-		 * to search, but use it as a reminder to search shmem.
-		 */
-		shmem = 0;
-		swcount = *swap_map;
-		if (swcount > 1) {
-			if (start_mm == &init_mm)
-				shmem = shmem_unuse(entry, page);
-			else
-				retval = unuse_mm(start_mm, entry, page);
-		}
-		if (*swap_map > 1) {
-			int set_start_mm = (*swap_map >= swcount);
-			struct list_head *p = &start_mm->mmlist;
-			struct mm_struct *new_start_mm = start_mm;
-			struct mm_struct *prev_mm = start_mm;
-			struct mm_struct *mm;
-
-			atomic_inc(&new_start_mm->mm_users);
-			atomic_inc(&prev_mm->mm_users);
-			spin_lock(&mmlist_lock);
-			while (*swap_map > 1 && !retval &&
-					(p = p->next) != &start_mm->mmlist) {
-				mm = list_entry(p, struct mm_struct, mmlist);
-				if (!atomic_inc_not_zero(&mm->mm_users))
-					continue;
-				spin_unlock(&mmlist_lock);
-				mmput(prev_mm);
-				prev_mm = mm;
-
-				cond_resched();
-
-				swcount = *swap_map;
-				if (swcount <= 1)
-					;
-				else if (mm == &init_mm) {
-					set_start_mm = 1;
-					shmem = shmem_unuse(entry, page);
-				} else
-					retval = unuse_mm(mm, entry, page);
-				if (set_start_mm && *swap_map < swcount) {
-					mmput(new_start_mm);
-					atomic_inc(&mm->mm_users);
-					new_start_mm = mm;
-					set_start_mm = 0;
-				}
-				spin_lock(&mmlist_lock);
-			}
-			spin_unlock(&mmlist_lock);
-			mmput(prev_mm);
-			mmput(start_mm);
-			start_mm = new_start_mm;
-		}
-		if (retval) {
-			unlock_page(page);
-			page_cache_release(page);
+		retval = try_to_unuse_entry(entry, swap_map, page,
+						&start_mm);
+		if (retval)
 			break;
-		}
-
-		/*
-		 * How could swap count reach 0x7fff when the maximum
-		 * pid is 0x7fff, and there's no way to repeat a swap
-		 * page within an mm (except in shmem, where it's the
-		 * shared object which takes the reference count)?
-		 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
-		 *
-		 * If that's wrong, then we should worry more about
-		 * exit_mmap() and do_munmap() cases described above:
-		 * we might be resetting SWAP_MAP_MAX too early here.
-		 * We know "Undead"s can happen, they're okay, so don't
-		 * report them; but do report if we reset SWAP_MAP_MAX.
-		 */
-		if (*swap_map == SWAP_MAP_MAX) {
-			spin_lock(&swap_lock);
-			*swap_map = 1;
-			spin_unlock(&swap_lock);
-			reset_overflow = 1;
-		}
-
-		/*
-		 * If a reference remains (rare), we would like to leave
-		 * the page in the swap cache; but try_to_unmap could
-		 * then re-duplicate the entry once we drop page lock,
-		 * so we might loop indefinitely; also, that page could
-		 * not be swapped out to other storage meanwhile.  So:
-		 * delete from cache even if there's another reference,
-		 * after ensuring that the data has been saved to disk -
-		 * since if the reference remains (rarer), it will be
-		 * read from disk into another page.  Splitting into two
-		 * pages would be incorrect if swap supported "shared
-		 * private" pages, but they are handled by tmpfs files.
-		 *
-		 * Note shmem_unuse already deleted a swappage from
-		 * the swap cache, unless the move to filepage failed:
-		 * in which case it left swappage in cache, lowered its
-		 * swap count to pass quickly through the loops above,
-		 * and now we must reincrement count to try again later.
-		 */
-		if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
-			struct writeback_control wbc = {
-				.sync_mode = WB_SYNC_NONE,
-			};
-
-			swap_writepage(page, &wbc);
-			lock_page(page);
-			wait_on_page_writeback(page);
-		}
-		if (PageSwapCache(page)) {
-			if (shmem)
-				swap_duplicate(entry);
-			else
-				delete_from_swap_cache(page);
-		}
-
-		/*
-		 * So we could skip searching mms once swap count went
-		 * to 1, we did not mark any present ptes as dirty: must
-		 * mark page dirty so shrink_list will preserve it.
-		 */
-		SetPageDirty(page);
-		unlock_page(page);
-		page_cache_release(page);
 
 		/*
 		 * Make sure that we aren't completely killing
@@ -879,10 +952,6 @@ static int try_to_unuse(unsigned int typ
 	}
 
 	mmput(start_mm);
-	if (reset_overflow) {
-		printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
-		swap_overflow = 0;
-	}
 	return retval;
 }
 
Index: linux-2.6/mm/swap_state.c
===================================================================
--- linux-2.6.orig/mm/swap_state.c	2006-11-01 15:11:54.000000000 +1100
+++ linux-2.6/mm/swap_state.c	2006-11-01 15:51:20.000000000 +1100
@@ -131,6 +131,8 @@ void __delete_from_swap_cache(struct pag
 	radix_tree_delete(&swapper_space.page_tree, page_private(page));
 	set_page_private(page, 0);
 	ClearPageSwapCache(page);
+	if (unlikely(PageError(page)))
+		ClearPageError(page);
 	total_swapcache_pages--;
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	INC_CACHE_INFO(del_total);
Index: linux-2.6/include/linux/swap.h
===================================================================
--- linux-2.6.orig/include/linux/swap.h	2006-11-01 15:54:10.000000000 +1100
+++ linux-2.6/include/linux/swap.h	2006-11-01 16:10:44.000000000 +1100
@@ -246,6 +246,7 @@ extern swp_entry_t get_swap_page_of_type
 extern int swap_duplicate(swp_entry_t);
 extern int valid_swaphandles(swp_entry_t, unsigned long *);
 extern void swap_free(swp_entry_t);
+extern void swap_free_markbad(swp_entry_t);
 extern void free_swap_and_cache(swp_entry_t);
 extern int swap_type_of(dev_t);
 extern unsigned int count_swap_pages(int, int);
@@ -253,6 +254,7 @@ extern sector_t map_swap_page(struct swa
 extern struct swap_info_struct *get_swap_info_struct(unsigned);
 extern int can_share_swap_page(struct page *);
 extern int remove_exclusive_swap_page(struct page *);
+extern void try_to_unuse_page_entry(struct page *page);
 struct backing_dev_info;
 extern spinlock_t swap_lock;
Index: linux-2.6/mm/page_io.c
===================================================================
--- linux-2.6.orig/mm/page_io.c	2006-11-01 15:52:40.000000000 +1100
+++ linux-2.6/mm/page_io.c	2006-11-01 15:52:46.000000000 +1100
@@ -59,15 +59,12 @@ static int end_swap_bio_write(struct bio
 		 * Re-dirty the page in order to avoid it being reclaimed.
 		 * Also print a dire warning that things will go BAD (tm)
 		 * very quickly.
-		 *
-		 * Also clear PG_reclaim to avoid rotate_reclaimable_page()
 		 */
 		set_page_dirty(page);
 		printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
 				imajor(bio->bi_bdev->bd_inode),
 				iminor(bio->bi_bdev->bd_inode),
 				(unsigned long long)bio->bi_sector);
-		ClearPageReclaim(page);
 	}
 	end_page_writeback(page);
 	bio_put(bio);
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c	2006-11-01 15:53:29.000000000 +1100
+++ linux-2.6/mm/vmscan.c	2006-11-01 16:07:41.000000000 +1100
@@ -486,9 +486,21 @@ static unsigned long shrink_page_list(st
 		 * Anonymous process memory has backing store?
 		 * Try to allocate it some swap space here.
 		 */
-		if (PageAnon(page) && !PageSwapCache(page))
-			if (!add_to_swap(page, GFP_ATOMIC))
-				goto activate_locked;
+		if (PageAnon(page)) {
+			/*
+			 * Encountered an error last time? Try to remove the
+			 * page from its current position, which will notice
+			 * the error and mark that swap entry bad. Then we can
+			 * try allocating another swap entry.
+			 */
+			if (PageSwapCache(page) && unlikely(PageError(page)))
+				try_to_unuse_page_entry(page);
+
+			if (!PageSwapCache(page)) {
+				if (!add_to_swap(page, GFP_ATOMIC))
+					goto activate_locked;
+			}
+		}
 #endif /* CONFIG_SWAP */
 
 		mapping = page_mapping(page);
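
For reviewers, a rough sketch of the intended end-to-end behaviour on the
reclaim path.  This is illustrative only and not part of the patch; it simply
condenses the shrink_page_list() hunk above together with the new helpers:

	/*
	 * Illustrative sketch only -- condensed from the shrink_page_list()
	 * hunk above.  PG_error was left set on the page by
	 * end_swap_bio_write() when an earlier swap write failed.
	 */
	if (PageAnon(page)) {
		if (PageSwapCache(page) && unlikely(PageError(page))) {
			/*
			 * Unuse the failed entry: the page is dropped from
			 * the swap cache (which clears PG_error, see the
			 * __delete_from_swap_cache change) and the entry is
			 * marked SWAP_MAP_BAD via swap_free_markbad().
			 */
			try_to_unuse_page_entry(page);
		}
		if (!PageSwapCache(page)) {
			/* the page is now free to get a fresh swap entry */
			if (!add_to_swap(page, GFP_ATOMIC))
				goto activate_locked;
		}
	}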