Message-Id: <1329402705-25454-10-git-send-email-mail@smogura.eu>
Date: Thu, 16 Feb 2012 15:31:37 +0100
From: Radosław Smogura <mail@...gura.eu>
To: linux-mm@...ck.org
Cc: Yongqiang Yang <xiaoqiangnk@...il.com>, mail@...gura.eu,
linux-ext4@...r.kernel.org
Subject: [PATCH 10/18] Support for huge page faulting
Adds some basic VM routines and macros to operate on the huge page
cache, designed for proper faulting of huge pages.
1. __do_fault - made common for huge and small pages (see the simplified fault path sketched below).
2. Simple rmap wrappers for huge pages.
3. Other changes.
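
A simplified view of the resulting shared-mapping fault path (fallback and
error handling omitted; the functions named are either existing fault-path
code or added by this patch):

	handle_mm_fault()
	  handle_pmd_fault()
	    check_if_hugemapping_is_possible0()  /* alignment and VMA checks */
	    __do_fault_huge()                    /* facade over __do_fault() */
	      vma->vm_ops->fault_huge()          /* filesystem returns the head page */
	      __huge_lock_check()                /* tails uptodate and linearly ordered */
	      set_pmd_at()                       /* install the huge pmd */
	    handle_pte_fault()                   /* fallback on VM_FAULT_NOHUGE or error */
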
Signed-off-by: Radosław Smogura <mail@...gura.eu>
---
include/linux/defrag-pagecache.h | 18 +--
include/linux/fs.h | 19 +-
include/linux/mm.h | 28 ++
include/linux/mm_types.h | 2 +-
include/linux/rmap.h | 9 +
mm/huge_memory.c | 42 +++
mm/memory.c | 528 +++++++++++++++++++++++++++++++-------
mm/page-writeback.c | 31 +++
mm/rmap.c | 29 ++
9 files changed, 582 insertions(+), 124 deletions(-)
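
Reviewer note, not part of the patch: a minimal sketch of how a filesystem
might wire up the new hooks. Everything named myfs_* is hypothetical and only
illustrates the intended call surface; it assumes a huge page has already been
established in the page cache (for example by the defragpage/migration path)
and glosses over the extra compound references and compound_lock state that
__huge_lock_check() expects.

	#include <linux/fs.h>
	#include <linux/mm.h>
	#include <linux/pagemap.h>

	/*
	 * Hypothetical ->fault_huge: hand back the cached compound head for a
	 * pmd-aligned fault, or ask the caller to fall back to the pte path.
	 */
	static int myfs_fault_huge(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct address_space *mapping = vma->vm_file->f_mapping;
		struct page *head = find_get_page(mapping, vmf->pgoff);

		if (!head)
			return VM_FAULT_NOHUGE;
		if (!PageHead(head) || !PageUptodate(head)) {
			page_cache_release(head);
			return VM_FAULT_NOHUGE;	/* retry with ->fault */
		}
		lock_page(head);
		vmf->page = head;
		return VM_FAULT_LOCKED;
	}

	static const struct vm_operations_struct myfs_vm_ops = {
		.fault		= filemap_fault,
		.fault_huge	= myfs_fault_huge,
	};

	static const struct address_space_operations myfs_aops = {
		.set_page_dirty		= __set_page_dirty_nobuffers,
		.set_page_dirty_huge	= set_page_dirty_huge,
	};
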
diff --git a/include/linux/defrag-pagecache.h b/include/linux/defrag-pagecache.h
index 46793de..4ca3468 100644
--- a/include/linux/defrag-pagecache.h
+++ b/include/linux/defrag-pagecache.h
@@ -8,7 +8,7 @@
#ifndef DEFRAG_PAGECACHE_H
#define DEFRAG_PAGECACHE_H
-#include <linux/fs.h>
+#include <linux/defrag-pagecache-base.h>
/* XXX Split this file into two public and protected - comments below
* Protected will contain
@@ -24,22 +24,6 @@ typedef struct page *defrag_generic_get_page(
const struct defrag_pagecache_ctl *ctl, struct inode *inode,
pgoff_t pageIndex);
-/** Passes additional information and controls to page defragmentation. */
-struct defrag_pagecache_ctl {
- /** If yes defragmentation will try to fill page caches. */
- char fillPages:1;
-
- /** If filling of page fails, defragmentation will fail too. Setting
- * this requires {@...k #fillPages} will be setted.
- */
- char requireFillPages:1;
-
- /** If yes defragmentation will try to force in many aspects, this may
- * cause, operation to run longer, but with greater probability of
- * success. */
- char force:1;
-};
-
/** Defragments page cache of specified file and migrates it's to huge pages.
*
* @param f
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bfd9122..7288166 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -10,10 +10,7 @@
#include <linux/ioctl.h>
#include <linux/blk_types.h>
#include <linux/types.h>
-
-#ifdef CONFIG_HUGEPAGECACHE
-#include <linux/defrag-pagecache.h>
-#endif
+#include <linux/defrag-pagecache-base.h>
/*
* It's silly to have NR_OPEN bigger than NR_FILE, but you can change
@@ -596,6 +593,9 @@ struct address_space_operations {
/* Set a page dirty. Return true if this dirtied it */
int (*set_page_dirty)(struct page *page);
+ /** Same as \a set_page_dirty but for a huge page. */
+ int (*set_page_dirty_huge)(struct page *page);
+
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
@@ -606,7 +606,6 @@ struct address_space_operations {
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
-#ifdef CONFIG_HUGEPAGECACHE
/** Used to defrag (migrate) pages at position {@...e pos}
* to huge pages. Having this not {@...e NULL} will indicate that
* address space, generally, supports huge pages (transaprent
@@ -616,15 +615,19 @@ struct address_space_operations {
*
* @param pagep on success will be setted to established huge page
*
- * @returns TODO What to return?
- * {@...e 0} on success, value less then {@...e 0} on error
+ * @returns {@code 0} on success, a value less than {@code 0} on error
*/
int (*defragpage) (struct file *, struct address_space *mapping,
loff_t pos,
struct page **pagep,
const struct defrag_pagecache_ctl *ctl);
-#endif
+ /** Used to split a page; this method may be called under memory
+ * pressure. Actually, you should not split the page.
+ */
+ int (*split_page) (struct file *file, struct address_space *mapping,
+ loff_t pos, struct page *huge_page);
+
/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidatepage) (struct page *, unsigned long);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 72f6a50..27a10c8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -206,10 +206,19 @@ struct vm_operations_struct {
void (*close)(struct vm_area_struct * area);
int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
+ /** Same as \a fault but should return a huge page instead of a single
+ * one. If this function fails, the caller may try again with \a fault.
+ */
+ int (*fault_huge)(struct vm_area_struct *vma, struct vm_fault *vmf);
+
/* notification that a previously read-only page is about to become
* writable, if an error is returned it will cause a SIGBUS */
int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf);
+ /** Same as \a page_mkwrite, but for a huge page. */
+ int (*page_mkwrite_huge)(struct vm_area_struct *vma,
+ struct vm_fault *vmf);
+
/* called by access_process_vm when get_user_pages() fails, typically
* for use by special VMAs that can switch between memory and hardware
*/
@@ -534,6 +543,16 @@ static inline void get_page(struct page *page)
}
}
+/** Bumps the tail pages' usage counts. If there is at least one tail page that
+ * does not have a valid mapping, the page counts are left untouched.
+ */
+extern void get_page_tails_for_fmap(struct page *head);
+
+/** Decreases the tail pages' usage counts.
+ * This function assumes the compound page is held (get) or frozen.
+ */
+extern void put_page_tails_for_fmap(struct page *head);
+
static inline void get_huge_page_tail(struct page *page)
{
/*
@@ -996,6 +1015,7 @@ static inline int page_mapped(struct page *page)
#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */
+#define VM_FAULT_NOHUGE 0x0800 /* ->fault_huge: no huge page available. */
#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
@@ -1161,6 +1181,14 @@ int redirty_page_for_writepage(struct writeback_control *wbc,
void account_page_dirtied(struct page *page, struct address_space *mapping);
void account_page_writeback(struct page *page);
int set_page_dirty(struct page *page);
+
+/** Sets a huge page dirty; this will lock all tails, the head should be locked.
+ * The compound should be held (get) or frozen. Skips pages that have no mapping.
+ *
+ * @param head head page of the compound
+ * @return number of successful set_page_dirty() calls
+ */
+int set_page_dirty_huge(struct page *head);
int set_page_dirty_lock(struct page *page);
int clear_page_dirty_for_io(struct page *page);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7649722..7d2c09d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -296,7 +296,7 @@ struct vm_area_struct {
/* Function pointers to deal with this struct. */
const struct vm_operations_struct *vm_ops;
-
+
/* Information about our backing store: */
unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE
units, *not* PAGE_CACHE_SIZE */
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 1cdd62a..bc547cb 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -142,8 +142,17 @@ void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long, int);
void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
void page_add_file_rmap(struct page *);
+
+/** Adds rmap for a huge page; the compound page must be held (get) or frozen.
+ */
+extern void page_add_file_rmap_huge(struct page *head);
+
void page_remove_rmap(struct page *);
+/** Removes rmap for a huge page; the compound page must be held (get) or frozen.
+ */
+void page_remove_rmap_huge(struct page *);
+
void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long);
void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e3b4c38..74d2e84 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2455,3 +2455,45 @@ void __vma_adjust_trans_huge(struct vm_area_struct *vma,
split_huge_page_address(next->vm_mm, nstart);
}
}
+
+/** Bumps the tail pages' usage counts. This function assumes the compound page
+ * is held (get) or frozen.
+ */
+void get_page_tails_for_fmap(struct page *head)
+{
+ struct page *page;
+
+ VM_BUG_ON(!PageHead(head));
+ VM_BUG_ON(atomic_read(&head[2]._compound_usage) == 1);
+ VM_BUG_ON(compound_order(head) < 2);
+
+ get_page(head + 1);
+ /* We may use __first_page, because we hold the compound as a whole. */
+ for (page = head + 2; page->__first_page == head; page++) {
+ VM_BUG_ON(!atomic_read(&page->_count));
+ VM_BUG_ON(!page->mapping);
+ VM_BUG_ON(!PageTail(page));
+ get_page(page);
+ }
+}
+
+/** Decreases the tail pages' usage counts.
+ * This function assumes the compound page is held (get) or frozen.
+ */
+void put_page_tails_for_fmap(struct page *head)
+{
+ struct page *page;
+
+ VM_BUG_ON(!PageHead(head));
+ VM_BUG_ON(atomic_read(&head[2]._compound_usage) == 1);
+ VM_BUG_ON(compound_order(head) < 2);
+
+ put_page(head + 1);
+ /* We may use __first_page, because we hold the compound as a whole. */
+ for (page = head + 2; page->__first_page == head; page++) {
+ VM_BUG_ON(!atomic_read(&page->_count));
+ VM_BUG_ON(!page->mapping);
+ VM_BUG_ON(!PageTail(page));
+ put_page(page);
+ }
+}
diff --git a/mm/memory.c b/mm/memory.c
index a0ab73c..7427c9b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3148,7 +3148,137 @@ oom:
return VM_FAULT_OOM;
}
-/*
+/** Level 0 check whether it's possible to establish a huge pmd in the process
+ * address space.
+ */
+static int check_if_hugemapping_is_possible0(
+ struct vm_area_struct *vma,
+ unsigned long address,
+ pgoff_t pgoff,
+ pmd_t pmdVal /* Keep pmd for THP for Private Mapping. */)
+{
+ if (vma->vm_ops) {
+ /* This is the base check. */
+ if (!vma->vm_ops->fault_huge)
+ return 0;
+ } else {
+ return 0;
+ }
+
+ if (vma->vm_flags & VM_SHARED && !(vma->vm_flags & VM_NONLINEAR)) {
+ /* Check if VMA address is pmd aligned */
+ if ((address & ~PMD_MASK) != 0)
+ return 0;
+
+ /* Check if pgoff is huge page aligned */
+ /* XXX This should be exported as it's reused in defrag. */
+ if ((pgoff & ((1 << (PMD_SHIFT - PAGE_SHIFT)) - 1)) != 0)
+ return 0;
+
+ /* Check if huge pmd will fit inside VMA.
+ * pmd_address_end returns first byte after end, not last byte!
+ */
+ if (!(pmd_addr_end(address, (unsigned long) -1) <= vma->vm_end))
+ return 0;
+
+ /* WIP [Private THP], check if the pmd is marked as "do not make THP",
+ * e.g. because it has COWs (COWs give milk).
+ * We need to add such a flag because...
+ */
+
+ /* Check if the file has enough length - not needed if there is a
+ * huge page in the page cache, as this implies the file has enough length.
+ * TODO Think about the above. If true, make it a requirement for THP
+ * support in the page cache (put in documentation).
+ * This may break the concept that the page cache may hold a
+ * not-up-to-date huge page, too.
+ */
+ } else {
+ /* Anonymous VMA - not coded yet. */
+ return 0;
+ }
+
+ /* All tests passed */
+ printk(KERN_INFO "Chk - All passed\n");
+ return 1;
+}
+
+
+/** Common function for performing fault checks with support for huge pages.
+ * This method is designed to be wrapped (facaded) by others.
+ *
+ * TODO Still need to consider locking order, to prevent deadlocks...
+ * it looks like compound_lock -> page_lock will be better.
+ *
+ * @param head loaded head page, locked iff compound_lock is held, referenced
+ *
+ * @return {@code 0} on success
+ */
+static /*inline*/ int __huge_lock_check(
+ struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ pud_t *pud,
+ pmd_t pmd,
+ pgoff_t pgoff,
+ unsigned int flags,
+ struct page *head)
+{
+ struct page *workPage;
+ unsigned long workAddress;
+ unsigned int processedPages;
+
+ int result = 0;
+
+ VM_BUG_ON(!check_if_hugemapping_is_possible0(vma, address, pgoff,
+ pmd));
+ VM_BUG_ON(atomic_read(&head->_count) <= 2);
+ VM_BUG_ON(!PageHead(head));
+
+ /* TODO [Documentation] expose the rules below from the code.
+ *
+ * XXX With the tests in the loop, is it possible to map not-uptodate pages?
+ *
+ * It looks like that with the following design, removing the page
+ * uptodate flag for compound pages may require the compound lock
+ * or something else.
+ */
+
+ /* Check that the tail pages are uptodate and linearly ordered; this
+ * should not fail, as we hold compound_lock, but I can't guarantee it.
+ */
+ processedPages = 0;
+ workAddress = address;
+ /** XXX [Performance] compound_head is rather slow; make a new macro for
+ * when we hold a reference on the compound page.
+ */
+ for (workPage = head; compound_head(workPage) == head; workPage++) {
+ if (!PageUptodate(workPage)
+ || !workPage->mapping
+ || (workPage->index - processedPages != pgoff)) {
+ result = -EINVAL;
+ goto exit_processing;
+ }
+ /* We don't check ptes, because we have a shared mapping,
+ * so all ptes should be (or could be in the future) the same, meaning
+ * mainly protection flags. This check will be required for
+ * private mappings.
+ */
+ processedPages++;
+ workAddress += PAGE_SIZE;
+ }
+ if (processedPages != (1 << (PMD_SHIFT - PAGE_SHIFT))) {
+ /* Not enough processed pages, why? */
+ return processedPages + 1;
+ }
+
+exit_processing:
+ printk("Processed %d", processedPages);
+
+ return result;
+}
+
+/**
* __do_fault() tries to create a new page mapping. It aggressively
* tries to share with existing pages, but makes a separate copy if
* the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
@@ -3160,28 +3290,45 @@ oom:
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte neither mapped nor locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
+ *
+ * This method shares the same concepts for single and huge pages.
+ *
+ * @param pud pud entry; if NULL the method operates in single-page mode,
+ * otherwise it operates in huge-page mode.
*/
-static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+static inline int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pud_t *pud, pmd_t *pmd,
+ pgoff_t pgoff, unsigned int flags,
+ pmd_t orig_pmd, pte_t orig_pte)
{
pte_t *page_table;
+ pmd_t *huge_table;
+
+ pte_t entry;
+ pmd_t hentry;
+
spinlock_t *ptl;
struct page *page;
struct page *cow_page;
- pte_t entry;
+
int anon = 0;
struct page *dirty_page = NULL;
struct vm_fault vmf;
+ const struct vm_operations_struct *vm_ops = vma->vm_ops;
int ret;
int page_mkwrite = 0;
+ VM_BUG_ON((!!pmd) == (!!pud));
+
/*
* If we do COW later, allocate page befor taking lock_page()
* on the file cache page. This will reduce lock holding time.
*/
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
-
+ if (pud) {
+ /* Private mapping write not supported yet. */
+ BUG();
+ }
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
@@ -3196,14 +3343,20 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
} else
cow_page = NULL;
- vmf.virtual_address = (void __user *)(address & PAGE_MASK);
+ vmf.virtual_address = (void __user *)
+ (address & (pud ? HPAGE_MASK : PAGE_MASK));
vmf.pgoff = pgoff;
vmf.flags = flags;
vmf.page = NULL;
- ret = vma->vm_ops->fault(vma, &vmf);
+ /* XXX Tails should be referenced too. */
+ if (pud)
+ ret = vm_ops->fault_huge(vma, &vmf);
+ else
+ ret = vm_ops->fault(vma, &vmf);
+
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
- VM_FAULT_RETRY)))
+ VM_FAULT_RETRY | VM_FAULT_NOHUGE)))
goto uncharge_out;
if (unlikely(PageHWPoison(vmf.page))) {
@@ -3213,21 +3366,36 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
goto uncharge_out;
}
- /*
- * For consistency in subsequent calls, make the faulted page always
- * locked.
+ /* For consistency in subsequent calls, make the faulted page
+ * always locked.
*/
if (unlikely(!(ret & VM_FAULT_LOCKED)))
- lock_page(vmf.page);
+ lock_page(vmf.page);
else
VM_BUG_ON(!PageLocked(vmf.page));
+ page = vmf.page;
+ if (pud) {
+ /* Check the consistency of the page, whether it is applicable for
+ * huge mapping.
+ */
+ if (__huge_lock_check(mm, vma, address, pud, orig_pmd, pgoff,
+ flags, vmf.page)) {
+ unlock_page(page);
+ goto unwritable_page;
+ }
+ }
+
/*
* Should we do an early C-O-W break?
*/
- page = vmf.page;
if (flags & FAULT_FLAG_WRITE) {
if (!(vma->vm_flags & VM_SHARED)) {
+ if (pud) {
+ /* Private COWing not supported yet for huge pages. */
+ BUG();
+ }
+
page = cow_page;
anon = 1;
copy_user_highpage(page, vmf.page, address, vma);
@@ -3238,89 +3406,156 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* address space wants to know that the page is about
* to become writable
*/
- if (vma->vm_ops->page_mkwrite) {
+ if ((!pud && vm_ops->page_mkwrite) ||
+ (pud && vm_ops->page_mkwrite_huge)) {
int tmp;
-
unlock_page(page);
vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
- tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
+ tmp = pud ? vm_ops->page_mkwrite_huge(vma, &vmf) : vm_ops->page_mkwrite(vma, &vmf);
if (unlikely(tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
ret = tmp;
goto unwritable_page;
}
if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+ if (pud)
+ BUG();
lock_page(page);
if (!page->mapping) {
ret = 0; /* retry the fault */
- unlock_page(page);
goto unwritable_page;
}
} else
VM_BUG_ON(!PageLocked(page));
- page_mkwrite = 1;
+ page_mkwrite = pud ? (1 << (PMD_SHIFT - PAGE_SHIFT)) : 1;
}
}
}
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-
- /*
- * This silly early PAGE_DIRTY setting removes a race
- * due to the bad i386 page protection. But it's valid
- * for other architectures too.
- *
- * Note that if FAULT_FLAG_WRITE is set, we either now have
- * an exclusive copy of the page, or this is a shared mapping,
- * so we can make it writable and dirty to avoid having to
- * handle that later.
+ /* The following if is almost the same for the pud and non-pud cases,
+ * just the methods called differ. Keep them synchronized as far as possible.
*/
- /* Only go through if we didn't race with anybody else... */
- if (likely(pte_same(*page_table, orig_pte))) {
- flush_icache_page(vma, page);
- entry = mk_pte(page, vma->vm_page_prot);
- if (flags & FAULT_FLAG_WRITE)
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (anon) {
- inc_mm_counter_fast(mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, address);
- } else {
- inc_mm_counter_fast(mm, MM_FILEPAGES);
- page_add_file_rmap(page);
+ if (pud) {
+ huge_table = pmd_offset(pud, address);
+ /* During allocation of ptes, pte_alloc uses the mm's page table lock;
+ * it is not the best solution, but we reuse it here.
+ */
+ ptl = &mm->page_table_lock;
+ spin_lock(ptl);
+ if (likely(pmd_same(*huge_table, orig_pmd))) {
+ flush_icache_page(vma, page); /* TODO Arch specific? */
+ hentry = mk_pmd(page, vma->vm_page_prot);
+ hentry = pmd_mkhuge(hentry);
+
if (flags & FAULT_FLAG_WRITE) {
- dirty_page = page;
- get_page(dirty_page);
+ hentry = pmd_mkdirty(hentry);
+ /* TODO make it pmd_maybe_mkwrite */
+ if (likely(vma->vm_flags & VM_WRITE))
+ hentry = pmd_mkwrite(hentry);
}
- }
- set_pte_at(mm, address, page_table, entry);
+ if (anon) {
+ BUG();
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, address);
+ } else {
+ /* TODO Increment the huge pages counter... */
+ add_mm_counter_fast(mm, MM_FILEPAGES,
+ HPAGE_PMD_NR);
+ page_add_file_rmap_huge(page);
+ if (flags & FAULT_FLAG_WRITE) {
+ dirty_page = page;
+ get_page(dirty_page);
+ get_page_tails_for_fmap(dirty_page);
+ }
+ }
+ set_pmd_at(mm, address, huge_table, hentry);
- /* no need to invalidate: a not-present page won't be cached */
- update_mmu_cache(vma, address, page_table);
+ /* no need to invalidate: a not-present page won't be
+ * cached */
+ update_mmu_cache(vma, address, (pte_t *)huge_table);
+ } else {
+ if (cow_page)
+ mem_cgroup_uncharge_page(cow_page);
+ if (anon)
+ page_cache_release(page);
+ else
+ anon = 1; /* no anon but release faulted_page */
+ }
+ spin_unlock(ptl);
} else {
- if (cow_page)
- mem_cgroup_uncharge_page(cow_page);
- if (anon)
- page_cache_release(page);
- else
- anon = 1; /* no anon but release faulted_page */
- }
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ /*
+ * This silly early PAGE_DIRTY setting removes a race
+ * due to the bad i386 page protection. But it's valid
+ * for other architectures too.
+ *
+ * Note that if FAULT_FLAG_WRITE is set, we either now have
+ * an exclusive copy of the page, or this is a shared mapping,
+ * so we can make it writable and dirty to avoid having to
+ * handle that later.
+ */
+ /* Only go through if we didn't race with anybody else... */
+ if (likely(pte_same(*page_table, orig_pte))) {
+ flush_icache_page(vma, page);
+ entry = mk_pte(page, vma->vm_page_prot);
+ if (flags & FAULT_FLAG_WRITE)
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (anon) {
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, address);
+ } else {
+ inc_mm_counter_fast(mm, MM_FILEPAGES);
+ page_add_file_rmap(page);
+ if (flags & FAULT_FLAG_WRITE) {
+ dirty_page = page;
+ get_page(dirty_page);
+ }
+ }
+ set_pte_at(mm, address, page_table, entry);
- pte_unmap_unlock(page_table, ptl);
+ /* no need to invalidate: a not-present page won't be
+ * cached */
+ update_mmu_cache(vma, address, page_table);
+ } else {
+ if (cow_page)
+ mem_cgroup_uncharge_page(cow_page);
+ if (anon)
+ page_cache_release(page);
+ else
+ anon = 1; /* no anon but release faulted_page */
+ }
+ pte_unmap_unlock(page_table, ptl);
+ }
if (dirty_page) {
struct address_space *mapping = page->mapping;
- if (set_page_dirty(dirty_page))
- page_mkwrite = 1;
- unlock_page(dirty_page);
+ if (pud) {
+ int dirtied;
+ dirtied = set_page_dirty_huge(dirty_page);
+ unlock_page(dirty_page);
+ if (dirtied)
+ page_mkwrite = dirtied;
+ } else {
+ if (set_page_dirty(dirty_page))
+ page_mkwrite = 1;
+ unlock_page(dirty_page);
+ }
+
+ if (pud) {
+ put_page_tails_for_fmap(dirty_page);
+ compound_put(page);
+ }
+
put_page(dirty_page);
if (page_mkwrite && mapping) {
/*
* Some device drivers do not set page.mapping but still
* dirty their pages
*/
- balance_dirty_pages_ratelimited(mapping);
+ balance_dirty_pages_ratelimited_nr(mapping,
+ page_mkwrite);
}
/* file_update_time outside page_lock */
@@ -3328,6 +3563,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
file_update_time(vma->vm_file);
} else {
unlock_page(vmf.page);
+ if (pud)
+ compound_put(page);
if (anon)
page_cache_release(vmf.page);
}
@@ -3335,6 +3572,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return ret;
unwritable_page:
+ if (pud) {
+ compound_put(page);
+ put_page_tails_for_fmap(page);
+ }
page_cache_release(page);
return ret;
uncharge_out:
@@ -3346,6 +3587,33 @@ uncharge_out:
return ret;
}
+/** Facade for {@link __do_fault} to fault "huge" pages.
+ * GCC will strip the unneeded code based on the parameters passed.
+ */
+static int __do_fault_huge(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, pud_t *pud,
+ pgoff_t pgoff, unsigned int flags,
+ pmd_t orig_pmd)
+{
+ pte_t pte_any;
+ return __do_fault(
+ mm, vma, address, pud, NULL, pgoff, flags, orig_pmd, pte_any);
+}
+
+/** Facade for {@link __do_fault} to fault "normal", pte-level pages.
+ * GCC will strip the unneeded code based on the parameters passed.
+ */
+static int __do_fault_normal(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+{
+ pmd_t pmd_any;
+ return __do_fault(
+ mm, vma, address, NULL, pmd, pgoff, flags, pmd_any, orig_pte);
+}
+
static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
@@ -3354,7 +3622,7 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
pte_unmap(page_table);
- return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+ return __do_fault_normal(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
/*
@@ -3386,7 +3654,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
pgoff = pte_to_pgoff(orig_pte);
- return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+ return __do_fault_normal(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
/*
@@ -3455,6 +3723,105 @@ unlock:
return 0;
}
+/** Handles a fault at the pmd level. */
+int handle_pmd_fault(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pud_t *pud, pmd_t *pmd, unsigned int flags)
+{
+ pte_t *pte;
+ pgoff_t pgoff;
+ pmd_t pmdVal;
+ int faultResult;
+
+ if (!vma->vm_file) {
+ /* Anonymous THP handling */
+ if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
+ if (!vma->vm_ops) {
+ return do_huge_pmd_anonymous_page(mm, vma,
+ address, pmd, flags);
+ }
+ } else {
+ pmd_t orig_pmd = *pmd;
+ barrier();
+ if (pmd_trans_huge(orig_pmd)) {
+ if (flags & FAULT_FLAG_WRITE &&
+ !pmd_write(orig_pmd) &&
+ !pmd_trans_splitting(orig_pmd))
+ return do_huge_pmd_wp_page(mm, vma,
+ address, pmd, orig_pmd);
+ return 0;
+ }
+ goto handle_pte_level;
+ }
+ }
+ /***************************
+ * Page cache THP handling *
+ ***************************/
+ pmdVal = *pmd;
+ if (pmd_present(pmdVal) && !pmd_trans_huge(pmdVal))
+ goto handle_pte_level;
+
+ if ((address & HPAGE_MASK) < vma->vm_start)
+ goto handle_pte_level;
+
+ /* Even if possible, we currently support this only for SHARED VMAs.
+ *
+ * We support this only for shmem fs, but everyone is encouraged
+ * to add a few simple methods and test it for other file systems.
+ * Notes, warnings etc. are always welcome.
+ */
+ if (!(vma->vm_flags & VM_SHARED))
+ goto handle_pte_level;
+
+ /* Handle fault of possible vma with huge page. */
+ pgoff = (((address & HPAGE_MASK) - vma->vm_start) >> PAGE_SHIFT)
+ + vma->vm_pgoff;
+
+ if (!pmd_present(pmdVal)) {
+ /* No page at all. */
+ if (!check_if_hugemapping_is_possible0(vma, address, pgoff,
+ pmdVal))
+ goto handle_pte_level;
+ } else {
+ /* TODO Jump to making the page writable. If not, the full fault
+ * path will be reused for regular filesystems.
+ */
+ }
+
+ faultResult = __do_fault_huge(mm, vma, address, pud, pgoff, flags,
+ pmdVal);
+ if (!(faultResult & (VM_FAULT_ERROR | VM_FAULT_NOHUGE))) {
+ printk(KERN_INFO "Set huge pmd\n");
+ return faultResult;
+ }
+
+handle_pte_level:
+ /*
+ * Use __pte_alloc instead of pte_alloc_map, because we can't
+ * run pte_offset_map on the pmd, if an huge pmd could
+ * materialize from under us from a different thread.
+ */
+ if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
+ return VM_FAULT_OOM;
+ /* Page cache THP uses mm->page_table_lock to check if the pmd is still
+ * none just before setting the new huge pmd; if __pte_alloc succeeded
+ * then the pmd may be huge or "normal" with a pte page.
+ *
+ * if an huge pmd materialized from under us just retry later */
+ if (unlikely(pmd_trans_huge(*pmd)))
+ return 0;
+
+ /*
+ * A regular pmd is established and it can't morph into a huge pmd
+ * from under us anymore at this point because we hold the mmap_sem
+ * read mode and khugepaged takes it in write mode. So now it's
+ * safe to run pte_offset_map().
+ */
+ pte = pte_offset_map(pmd, address);
+
+ return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+}
+
/*
* By the time we get here, we already hold the mm semaphore
*/
@@ -3464,7 +3831,6 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- pte_t *pte;
__set_current_state(TASK_RUNNING);
@@ -3484,42 +3850,8 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
return VM_FAULT_OOM;
- if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
- if (!vma->vm_ops)
- return do_huge_pmd_anonymous_page(mm, vma, address,
- pmd, flags);
- } else {
- pmd_t orig_pmd = *pmd;
- barrier();
- if (pmd_trans_huge(orig_pmd)) {
- if (flags & FAULT_FLAG_WRITE &&
- !pmd_write(orig_pmd) &&
- !pmd_trans_splitting(orig_pmd))
- return do_huge_pmd_wp_page(mm, vma, address,
- pmd, orig_pmd);
- return 0;
- }
- }
- /*
- * Use __pte_alloc instead of pte_alloc_map, because we can't
- * run pte_offset_map on the pmd, if an huge pmd could
- * materialize from under us from a different thread.
- */
- if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
- return VM_FAULT_OOM;
- /* if an huge pmd materialized from under us just retry later */
- if (unlikely(pmd_trans_huge(*pmd)))
- return 0;
- /*
- * A regular pmd is established and it can't morph into a huge pmd
- * from under us anymore at this point because we hold the mmap_sem
- * read mode and khugepaged takes it in write mode. So now it's
- * safe to run pte_offset_map().
- */
- pte = pte_offset_map(pmd, address);
-
- return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+ return handle_pmd_fault(mm, vma, address, pud, pmd, flags);
}
#ifndef __PAGETABLE_PUD_FOLDED
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 363ba70..ff32b5d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2072,6 +2072,37 @@ int set_page_dirty(struct page *page)
}
EXPORT_SYMBOL(set_page_dirty);
+int set_page_dirty_huge(struct page *head)
+{
+ struct page *work;
+ int result = 0;
+
+ VM_BUG_ON(!PageHead(head));
+ VM_BUG_ON(!PageLocked(head));
+ VM_BUG_ON(atomic_read(&head[2]._compound_usage) == 1);
+
+ if (head->mapping)
+ result += set_page_dirty(head);
+ else
+ BUG_ON(!PageSplitDeque(head));
+
+ for (work = head+1; compound_head(work) == head; work++) {
+ VM_BUG_ON(page_has_private(work));
+ VM_BUG_ON(page_has_buffers(work));
+
+ lock_page(work);
+ if (work->mapping) {
+ result += set_page_dirty(work);
+ } else {
+ /* Bug if there is no mapping and split is not
+ * dequeued.
+ */
+ BUG_ON(!PageSplitDeque(head));
+ }
+ unlock_page(work);
+ }
+ return result;
+}
/*
* set_page_dirty() is racy if the caller has no reference against
* page->mapping->host, and if the page is unlocked. This is because another
diff --git a/mm/rmap.c b/mm/rmap.c
index c8454e0..11f54e0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1157,6 +1157,21 @@ void page_add_file_rmap(struct page *page)
}
}
+void page_add_file_rmap_huge(struct page *head)
+{
+ struct page *page;
+
+ VM_BUG_ON(!PageHead(head));
+ VM_BUG_ON(atomic_read(&head[2]._compound_usage) == 1);
+
+ page_add_file_rmap(head);
+ page_add_file_rmap(head + 1);
+ if (likely(compound_order(head) > 1)) {
+ for (page = head+2; page->__first_page == head; page++)
+ page_add_file_rmap(page);
+ }
+}
+
/**
* page_remove_rmap - take down pte mapping from a page
* @page: page to remove mapping from
@@ -1207,6 +1222,20 @@ void page_remove_rmap(struct page *page)
*/
}
+void page_remove_rmap_huge(struct page *head)
+{
+ struct page *page;
+
+ VM_BUG_ON(!PageHead(head));
+ VM_BUG_ON(atomic_read(&head[2]._compound_usage) == 1);
+
+ page_remove_rmap(head);
+ page_remove_rmap(head + 1);
+ if (likely(compound_order(head) > 1)) {
+ for (page = head+2; page->__first_page == head; page++)
+ page_remove_rmap(page);
+ }
+}
/*
* Subfunctions of try_to_unmap: try_to_unmap_one called
* repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file.
--
1.7.3.4