The conversion to an rwsem allows callbacks during rmap traversal for
files in a non-atomic context. A rw-style lock also allows concurrent
walking of the reverse map. The conversion is fairly straightforward
once the pieces of resched checking are removed: a semaphore, unlike a
spinlock, can be held across a reschedule, so the lock no longer has to
be dropped and retaken to bound latency. [Restarting unmapping is an
issue to be discussed.]

This slightly increases Aim9 performance results on an 8p.
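To illustrate the resulting locking discipline, here is a minimal
sketch (illustrative only; the example_* helpers are hypothetical and
merely mirror the pairing used in the hunks below). Read-side users
such as the rmap walkers take i_mmap_sem shared and may sleep while
holding it; modifications of the prio tree take it exclusive:

/* Sketch only -- not part of the patch proper. */
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/sched.h>

static void example_rmap_walk(struct address_space *mapping, pgoff_t pgoff)
{
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;

	/* Shared: walkers may run concurrently and may sleep. */
	down_read(&mapping->i_mmap_sem);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		/* A sleeping callback is now safe at this point. */
		cond_resched();
	}
	up_read(&mapping->i_mmap_sem);
}

static void example_tree_update(struct address_space *mapping,
				struct vm_area_struct *vma)
{
	/* Exclusive: changes to the tree still serialize. */
	down_write(&mapping->i_mmap_sem);
	flush_dcache_mmap_lock(mapping);
	vma_prio_tree_insert(vma, &mapping->i_mmap);
	flush_dcache_mmap_unlock(mapping);
	up_write(&mapping->i_mmap_sem);
}

The same pairing shows up below: pure traversal paths
(page_referenced_file, page_mkclean_file, try_to_unmap_file,
__xip_unmap, migration) take the semaphore for reading, while paths
that modify the tree or unmap through the gather list (fork, mmap,
mremap, fremap, hugetlb, truncation) take it for writing.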
Signed-off-by: Andrea Arcangeli
Signed-off-by: Christoph Lameter

---
 arch/x86/mm/hugetlbpage.c |    4 ++--
 fs/hugetlbfs/inode.c      |    4 ++--
 fs/inode.c                |    2 +-
 include/linux/fs.h        |    2 +-
 include/linux/mm.h        |    2 +-
 kernel/fork.c             |    4 ++--
 mm/filemap.c              |    8 ++++----
 mm/filemap_xip.c          |    4 ++--
 mm/fremap.c               |    4 ++--
 mm/hugetlb.c              |   10 +++++-----
 mm/memory.c               |   29 +++++++++--------------------
 mm/migrate.c              |    4 ++--
 mm/mmap.c                 |   43 ++++++++++++++++++++++---------------------
 mm/mremap.c               |    4 ++--
 mm/rmap.c                 |   20 +++++++++-----------
 15 files changed, 66 insertions(+), 78 deletions(-)

Index: linux-2.6/arch/x86/mm/hugetlbpage.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/hugetlbpage.c	2008-04-02 11:41:47.601676490 -0700
+++ linux-2.6/arch/x86/mm/hugetlbpage.c	2008-04-04 15:09:11.715211829 -0700
@@ -69,7 +69,7 @@ static void huge_pmd_share(struct mm_str
 	if (!vma_shareable(vma, addr))
 		return;
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 	vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
 		if (svma == vma)
 			continue;
@@ -94,7 +94,7 @@ static void huge_pmd_share(struct mm_str
 	put_page(virt_to_page(spte));
 	spin_unlock(&mm->page_table_lock);
 out:
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 }
 
 /*
Index: linux-2.6/fs/hugetlbfs/inode.c
===================================================================
--- linux-2.6.orig/fs/hugetlbfs/inode.c	2008-04-02 11:41:47.605676583 -0700
+++ linux-2.6/fs/hugetlbfs/inode.c	2008-04-04 15:09:11.743212273 -0700
@@ -454,10 +454,10 @@ static int hugetlb_vmtruncate(struct ino
 	pgoff = offset >> PAGE_SHIFT;
 
 	i_size_write(inode, offset);
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 	if (!prio_tree_empty(&mapping->i_mmap))
 		hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 	truncate_hugepages(inode, offset);
 	return 0;
 }
Index: linux-2.6/fs/inode.c
===================================================================
--- linux-2.6.orig/fs/inode.c	2008-04-02 11:41:47.613676625 -0700
+++ linux-2.6/fs/inode.c	2008-04-04 15:09:11.755212477 -0700
@@ -210,7 +210,7 @@ void inode_init_once(struct inode *inode
 	INIT_LIST_HEAD(&inode->i_devices);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	rwlock_init(&inode->i_data.tree_lock);
-	spin_lock_init(&inode->i_data.i_mmap_lock);
+	init_rwsem(&inode->i_data.i_mmap_sem);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
 	spin_lock_init(&inode->i_data.private_lock);
 	INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h	2008-04-02 11:41:47.621676899 -0700
+++ linux-2.6/include/linux/fs.h	2008-04-04 15:09:11.755212477 -0700
@@ -503,7 +503,7 @@ struct address_space {
 	unsigned int		i_mmap_writable;/* count VM_SHARED mappings */
 	struct prio_tree_root	i_mmap;		/* tree of private and shared mappings */
 	struct list_head	i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
-	spinlock_t		i_mmap_lock;	/* protect tree, count, list */
+	struct rw_semaphore	i_mmap_sem;	/* protect tree, count, list */
 	unsigned int		truncate_count;	/* Cover race condition with truncate */
 	unsigned long		nrpages;	/* number of total pages */
 	pgoff_t			writeback_index;/* writeback starts here */
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h	2008-04-04 15:09:11.687211361 -0700
+++ linux-2.6/include/linux/mm.h	2008-04-04 15:09:45.883767696 -0700
@@ -716,7 +716,7 @@ struct zap_details {
 	struct address_space *check_mapping;	/* Check page->mapping if set */
 	pgoff_t	first_index;			/* Lowest page->index to unmap */
 	pgoff_t	last_index;			/* Highest page->index to unmap */
-	spinlock_t *i_mmap_lock;		/* For unmap_mapping_range: */
+	struct rw_semaphore *i_mmap_sem;	/* For unmap_mapping_range: */
 	unsigned long truncate_count;		/* Compare vm_truncate_count */
 };
 
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c	2008-04-04 15:07:38.857699751 -0700
+++ linux-2.6/kernel/fork.c	2008-04-04 15:09:11.759212563 -0700
@@ -273,12 +273,12 @@ static int dup_mmap(struct mm_struct *mm
 				atomic_dec(&inode->i_writecount);
 
 			/* insert tmp into the share list, just after mpnt */
-			spin_lock(&file->f_mapping->i_mmap_lock);
+			down_write(&file->f_mapping->i_mmap_sem);
 			tmp->vm_truncate_count = mpnt->vm_truncate_count;
 			flush_dcache_mmap_lock(file->f_mapping);
 			vma_prio_tree_add(tmp, mpnt);
 			flush_dcache_mmap_unlock(file->f_mapping);
-			spin_unlock(&file->f_mapping->i_mmap_lock);
+			up_write(&file->f_mapping->i_mmap_sem);
 		}
 
 		/*
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c	2008-04-02 11:41:47.641677219 -0700
+++ linux-2.6/mm/filemap.c	2008-04-04 15:09:44.663747838 -0700
@@ -61,16 +61,16 @@ generic_file_direct_IO(int rw, struct ki
 /*
  * Lock ordering:
  *
- *  ->i_mmap_lock		(vmtruncate)
+ *  ->i_mmap_sem		(vmtruncate)
  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *      ->swap_lock		(exclusive_swap_page, others)
  *        ->mapping->tree_lock
 *
 *  ->i_mutex
- *    ->i_mmap_lock		(truncate->unmap_mapping_range)
+ *    ->i_mmap_sem		(truncate->unmap_mapping_range)
 *
 *  ->mmap_sem
- *    ->i_mmap_lock
+ *    ->i_mmap_sem
 *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
 *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
 *
@@ -87,7 +87,7 @@ generic_file_direct_IO(int rw, struct ki
 *    ->sb_lock			(fs/fs-writeback.c)
 *    ->mapping->tree_lock	(__sync_single_inode)
 *
- *  ->i_mmap_lock
+ *  ->i_mmap_sem
 *    ->anon_vma.lock		(vma_adjust)
 *
 *  ->anon_vma.lock
Index: linux-2.6/mm/filemap_xip.c
===================================================================
--- linux-2.6.orig/mm/filemap_xip.c	2008-04-04 15:07:38.861699817 -0700
+++ linux-2.6/mm/filemap_xip.c	2008-04-04 15:09:11.767212672 -0700
@@ -184,7 +184,7 @@ __xip_unmap (struct address_space * mapp
 	if (!page)
 		return;
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		mm = vma->vm_mm;
 		address = vma->vm_start +
@@ -206,7 +206,7 @@ __xip_unmap (struct address_space * mapp
 		emm_notify(mm, emm_invalidate_end,
 					address, address + PAGE_SIZE);
 	}
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 }
 
 /*
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c	2008-04-04 15:07:38.861699817 -0700
+++ linux-2.6/mm/fremap.c	2008-04-04 15:09:11.767212672 -0700
@@ -205,13 +205,13 @@ asmlinkage long sys_remap_file_pages(uns
 			}
 			goto out;
 		}
-		spin_lock(&mapping->i_mmap_lock);
+		down_write(&mapping->i_mmap_sem);
 		flush_dcache_mmap_lock(mapping);
 		vma->vm_flags |= VM_NONLINEAR;
 		vma_prio_tree_remove(vma, &mapping->i_mmap);
 		vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
 		flush_dcache_mmap_unlock(mapping);
-		spin_unlock(&mapping->i_mmap_lock);
+		up_write(&mapping->i_mmap_sem);
 	}
 
 	emm_notify(mm, emm_invalidate_start, start, end);
Index: linux-2.6/mm/hugetlb.c
===================================================================
--- linux-2.6.orig/mm/hugetlb.c	2008-04-04 15:07:38.861699817 -0700
+++ linux-2.6/mm/hugetlb.c	2008-04-04 15:09:11.771212752 -0700
@@ -790,7 +790,7 @@ void __unmap_hugepage_range(struct vm_ar
 	struct page *page;
 	struct page *tmp;
 	/*
-	 * A page gathering list, protected by per file i_mmap_lock. The
+	 * A page gathering list, protected by per file i_mmap_sem. The
 	 * lock is used to avoid list corruption from multiple unmapping
 	 * of the same page since we are using page->lru.
 	 */
@@ -840,9 +840,9 @@ void unmap_hugepage_range(struct vm_area
 	 * do nothing in this case.
 	 */
 	if (vma->vm_file) {
-		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+		down_write(&vma->vm_file->f_mapping->i_mmap_sem);
 		__unmap_hugepage_range(vma, start, end);
-		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+		up_write(&vma->vm_file->f_mapping->i_mmap_sem);
 	}
 }
 
@@ -1085,7 +1085,7 @@ void hugetlb_change_protection(struct vm
 	BUG_ON(address >= end);
 	flush_cache_range(vma, address, end);
 
-	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+	down_write(&vma->vm_file->f_mapping->i_mmap_sem);
 	spin_lock(&mm->page_table_lock);
 	for (; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
@@ -1100,7 +1100,7 @@ void hugetlb_change_protection(struct vm
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
-	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+	up_write(&vma->vm_file->f_mapping->i_mmap_sem);
 
 	flush_tlb_range(vma, start, end);
 }
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2008-04-04 15:09:11.687211361 -0700
+++ linux-2.6/mm/memory.c	2008-04-04 15:09:45.887767772 -0700
@@ -839,7 +839,6 @@ unsigned long unmap_vmas(struct mmu_gath
 	unsigned long tlb_start = 0;	/* For tlb_finish_mmu */
 	int tlb_start_valid = 0;
 	unsigned long start = start_addr;
-	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
 	int fullmm = (*tlbp)->fullmm;
 
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
@@ -876,22 +875,12 @@ unsigned long unmap_vmas(struct mmu_gath
 			}
 
 			tlb_finish_mmu(*tlbp, tlb_start, start);
-
-			if (need_resched() ||
-				(i_mmap_lock && spin_needbreak(i_mmap_lock))) {
-				if (i_mmap_lock) {
-					*tlbp = NULL;
-					goto out;
-				}
-				cond_resched();
-			}
-
+			cond_resched();
 			*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
 			tlb_start_valid = 0;
 			zap_work = ZAP_BLOCK_SIZE;
 		}
 	}
-out:
 	return start;	/* which is now the end (or restart) address */
 }
 
@@ -1757,7 +1746,7 @@ unwritable_page:
 /*
  * Helper functions for unmap_mapping_range().
  *
- * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
+ * __ Notes on dropping i_mmap_sem to reduce latency while unmapping __
  *
  * We have to restart searching the prio_tree whenever we drop the lock,
  * since the iterator is only valid while the lock is held, and anyway
@@ -1776,7 +1765,7 @@ unwritable_page:
  * can't efficiently keep all vmas in step with mapping->truncate_count:
  * so instead reset them all whenever it wraps back to 0 (then go to 1).
  * mapping->truncate_count and vma->vm_truncate_count are protected by
- * i_mmap_lock.
+ * i_mmap_sem.
  *
  * In order to make forward progress despite repeatedly restarting some
  * large vma, note the restart_addr from unmap_vmas when it breaks out:
@@ -1826,7 +1815,7 @@ again:
 
 		restart_addr = zap_page_range(vma, start_addr,
 					end_addr - start_addr, details);
-		need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
+		need_break = need_resched();
 
 		if (restart_addr >= end_addr) {
 			/* We have now completed this vma: mark it so */
@@ -1840,9 +1829,9 @@ again:
 			goto again;
 	}
 
-	spin_unlock(details->i_mmap_lock);
+	up_write(details->i_mmap_sem);
 	cond_resched();
-	spin_lock(details->i_mmap_lock);
+	down_write(details->i_mmap_sem);
 	return -EINTR;
 }
 
@@ -1936,9 +1925,9 @@ void unmap_mapping_range(struct address_
 		details.last_index = hba + hlen - 1;
 	if (details.last_index < details.first_index)
 		details.last_index = ULONG_MAX;
-	details.i_mmap_lock = &mapping->i_mmap_lock;
+	details.i_mmap_sem = &mapping->i_mmap_sem;
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_write(&mapping->i_mmap_sem);
 
 	/* Protect against endless unmapping loops */
 	mapping->truncate_count++;
@@ -1953,7 +1942,7 @@ void unmap_mapping_range(struct address_
 		unmap_mapping_range_tree(&mapping->i_mmap, &details);
 	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
 		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
-	spin_unlock(&mapping->i_mmap_lock);
+	up_write(&mapping->i_mmap_sem);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
Index: linux-2.6/mm/migrate.c
===================================================================
--- linux-2.6.orig/mm/migrate.c	2008-04-02 11:41:47.673677614 -0700
+++ linux-2.6/mm/migrate.c	2008-04-04 15:09:45.443760619 -0700
@@ -211,12 +211,12 @@ static void remove_file_migration_ptes(s
 
 	if (!mapping)
 		return;
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
 		remove_migration_pte(vma, old, new);
 
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 }
 
 /*
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c	2008-04-04 15:09:11.687211361 -0700
+++ linux-2.6/mm/mmap.c	2008-04-04 15:13:59.643887398 -0700
@@ -186,7 +186,7 @@ error:
 }
 
 /*
- * Requires inode->i_mapping->i_mmap_lock
+ * Requires inode->i_mapping->i_mmap_sem
 */
 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 		struct file *file, struct address_space *mapping)
@@ -214,9 +214,9 @@ void unlink_file_vma(struct vm_area_stru
 
 	if (file) {
 		struct address_space *mapping = file->f_mapping;
-		spin_lock(&mapping->i_mmap_lock);
+		down_write(&mapping->i_mmap_sem);
 		__remove_shared_vm_struct(vma, file, mapping);
-		spin_unlock(&mapping->i_mmap_lock);
+		up_write(&mapping->i_mmap_sem);
 	}
 }
 
@@ -439,7 +439,7 @@ static void vma_link(struct mm_struct *m
 		mapping = vma->vm_file->f_mapping;
 
 	if (mapping) {
-		spin_lock(&mapping->i_mmap_lock);
+		down_write(&mapping->i_mmap_sem);
 		vma->vm_truncate_count = mapping->truncate_count;
 	}
 	anon_vma_lock(vma);
@@ -449,7 +449,7 @@ static void vma_link(struct mm_struct *m
 	anon_vma_unlock(vma);
 
 	if (mapping)
-		spin_unlock(&mapping->i_mmap_lock);
+		up_write(&mapping->i_mmap_sem);
 
 	mm->map_count++;
 	validate_mm(mm);
@@ -536,7 +536,7 @@ again:			remove_next = 1 + (end > next->
 		mapping = file->f_mapping;
 		if (!(vma->vm_flags & VM_NONLINEAR))
 			root = &mapping->i_mmap;
-		spin_lock(&mapping->i_mmap_lock);
+		down_write(&mapping->i_mmap_sem);
 		if (importer &&
 		    vma->vm_truncate_count != next->vm_truncate_count) {
 			/*
@@ -620,7 +620,7 @@ again:			remove_next = 1 + (end > next->
 	if (anon_vma)
 		spin_unlock(&anon_vma->lock);
 	if (mapping)
-		spin_unlock(&mapping->i_mmap_lock);
+		up_write(&mapping->i_mmap_sem);
 
 	if (remove_next) {
 		if (file)
@@ -2064,7 +2064,7 @@ void exit_mmap(struct mm_struct *mm)
 
 /* Insert vm structure into process list sorted by address
 * and into the inode's i_mmap tree.  If vm_file is non-NULL
- * then i_mmap_lock is taken here.
+ * then i_mmap_sem is taken here.
 */
 int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 {
@@ -2249,28 +2249,29 @@ int install_special_mapping(struct mm_st
 static void mm_lock_unlock(struct mm_struct *mm, int lock)
 {
 	struct vm_area_struct *vma;
-	spinlock_t *i_mmap_lock_last, *anon_vma_lock_last;
+	struct rw_semaphore *i_mmap_sem_last;
+	spinlock_t *anon_vma_lock_last;
 
-	i_mmap_lock_last = NULL;
+	i_mmap_sem_last = NULL;
 	for (;;) {
-		spinlock_t *i_mmap_lock = (spinlock_t *) -1UL;
+		struct rw_semaphore *i_mmap_sem = (struct rw_semaphore *) -1UL;
 		for (vma = mm->mmap; vma; vma = vma->vm_next)
 			if (vma->vm_file && vma->vm_file->f_mapping &&
-			    (unsigned long) i_mmap_lock >
+			    (unsigned long) i_mmap_sem >
 			    (unsigned long)
-			    &vma->vm_file->f_mapping->i_mmap_lock &&
+			    &vma->vm_file->f_mapping->i_mmap_sem &&
 			    (unsigned long)
-			    &vma->vm_file->f_mapping->i_mmap_lock >
-			    (unsigned long) i_mmap_lock_last)
-				i_mmap_lock =
-					&vma->vm_file->f_mapping->i_mmap_lock;
-		if (i_mmap_lock == (spinlock_t *) -1UL)
+			    &vma->vm_file->f_mapping->i_mmap_sem >
+			    (unsigned long) i_mmap_sem_last)
+				i_mmap_sem =
+					&vma->vm_file->f_mapping->i_mmap_sem;
+		if (i_mmap_sem == (struct rw_semaphore *) -1UL)
 			break;
-		i_mmap_lock_last = i_mmap_lock;
+		i_mmap_sem_last = i_mmap_sem;
 		if (lock)
-			spin_lock(i_mmap_lock);
+			down_write(i_mmap_sem);
 		else
-			spin_unlock(i_mmap_lock);
+			up_write(i_mmap_sem);
 	}
 
 	anon_vma_lock_last = NULL;
Index: linux-2.6/mm/mremap.c
===================================================================
--- linux-2.6.orig/mm/mremap.c	2008-04-04 15:07:38.861699817 -0700
+++ linux-2.6/mm/mremap.c	2008-04-04 15:09:11.795213130 -0700
@@ -86,7 +86,7 @@ static void move_ptes(struct vm_area_str
 		 * and we propagate stale pages into the dst afterward.
 		 */
 		mapping = vma->vm_file->f_mapping;
-		spin_lock(&mapping->i_mmap_lock);
+		down_write(&mapping->i_mmap_sem);
 		if (new_vma->vm_truncate_count &&
 		    new_vma->vm_truncate_count != vma->vm_truncate_count)
 			new_vma->vm_truncate_count = 0;
@@ -118,7 +118,7 @@ static void move_ptes(struct vm_area_str
 		pte_unmap_nested(new_pte - 1);
 	pte_unmap_unlock(old_pte - 1, old_ptl);
 	if (mapping)
-		spin_unlock(&mapping->i_mmap_lock);
+		up_write(&mapping->i_mmap_sem);
 
 	emm_notify(mm, emm_invalidate_end, old_start, old_end);
 }
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c	2008-04-04 15:08:56.630966343 -0700
+++ linux-2.6/mm/rmap.c	2008-04-04 15:09:45.451760720 -0700
@@ -24,7 +24,7 @@
 *   inode->i_alloc_sem (vmtruncate_range)
 *   mm->mmap_sem
 *     page->flags PG_locked (lock_page)
- *       mapping->i_mmap_lock
+ *       mapping->i_mmap_sem
 *         anon_vma->lock
 *           mm->page_table_lock or pte_lock
 *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -450,14 +450,14 @@ static int page_referenced_file(struct p
 	 * The page lock not only makes sure that page->mapping cannot
 	 * suddenly be NULLified by truncation, it makes sure that the
 	 * structure at mapping cannot be freed and reused yet,
-	 * so we can safely take mapping->i_mmap_lock.
+	 * so we can safely take mapping->i_mmap_sem.
 	 */
 	BUG_ON(!PageLocked(page));
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 
 	/*
-	 * i_mmap_lock does not stabilize mapcount at all, but mapcount
+	 * i_mmap_sem does not stabilize mapcount at all, but mapcount
 	 * is more likely to be accurate if we note it after spinning.
 	 */
 	mapcount = page_mapcount(page);
@@ -480,7 +480,7 @@ static int page_referenced_file(struct p
 			break;
 	}
 
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 	return referenced;
 }
 
@@ -566,12 +566,12 @@ static int page_mkclean_file(struct addr
 
 	BUG_ON(PageAnon(page));
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		if (vma->vm_flags & VM_SHARED)
 			ret += page_mkclean_one(page, vma);
 	}
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 	return ret;
 }
 
@@ -1010,7 +1010,7 @@ static int try_to_unmap_file(struct page
 	unsigned long max_nl_size = 0;
 	unsigned int mapcount;
 
-	spin_lock(&mapping->i_mmap_lock);
+	down_read(&mapping->i_mmap_sem);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		ret = try_to_unmap_one(page, vma, migration);
 		if (ret == SWAP_FAIL || !page_mapped(page))
@@ -1047,7 +1047,6 @@ static int try_to_unmap_file(struct page
 	mapcount = page_mapcount(page);
 	if (!mapcount)
 		goto out;
-	cond_resched_lock(&mapping->i_mmap_lock);
 
 	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
 	if (max_nl_cursor == 0)
@@ -1069,7 +1068,6 @@ static int try_to_unmap_file(struct page
 			}
 			vma->vm_private_data = (void *) max_nl_cursor;
 		}
-		cond_resched_lock(&mapping->i_mmap_lock);
 		max_nl_cursor += CLUSTER_SIZE;
 	} while (max_nl_cursor <= max_nl_size);
 
@@ -1081,7 +1079,7 @@ static int try_to_unmap_file(struct page
 	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 						shared.vm_set.list)
 		vma->vm_private_data = NULL;
 out:
-	spin_unlock(&mapping->i_mmap_lock);
+	up_read(&mapping->i_mmap_sem);
 	return ret;
 }
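A note on the mm_lock_unlock() hunk above: when every i_mmap_sem in an
mm must be held at once, the semaphores are acquired in ascending
address order, so two tasks locking overlapping sets of mappings
cannot deadlock against each other. A simplified sketch of that
ordering idea (hypothetical helper, lock side only; the actual patch
drives both down_write and up_write from the same loop):

/* Sketch only -- not part of the patch proper. */
#include <linux/mm.h>
#include <linux/fs.h>

static void example_lock_all_i_mmap_sems(struct mm_struct *mm)
{
	struct rw_semaphore *last = NULL;

	for (;;) {
		struct vm_area_struct *vma;
		struct rw_semaphore *next = (struct rw_semaphore *) -1UL;

		/*
		 * O(vmas^2) overall: each pass selects the
		 * lowest-addressed semaphore not yet taken.
		 */
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			struct rw_semaphore *sem;

			if (!vma->vm_file || !vma->vm_file->f_mapping)
				continue;
			sem = &vma->vm_file->f_mapping->i_mmap_sem;
			if ((unsigned long) sem > (unsigned long) last &&
			    (unsigned long) sem < (unsigned long) next)
				next = sem;
		}
		if (next == (struct rw_semaphore *) -1UL)
			break;		/* everything is locked */
		down_write(next);	/* strictly ascending: no ABBA */
		last = next;
	}
}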