linux-kernel - Re: [PATCH v4 08/12] dax: remove dax_pmd

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20161003095646.GN6457@quack2.suse.cz>
Date:   Mon, 3 Oct 2016 11:56:46 +0200
From:   Jan Kara <jack@...e.cz>
To:     Ross Zwisler <ross.zwisler@...ux.intel.com>
Cc:     linux-kernel@...r.kernel.org, Theodore Ts'o <tytso@....edu>,
        Alexander Viro <viro@...iv.linux.org.uk>,
        Andreas Dilger <adilger.kernel@...ger.ca>,
        Andrew Morton <akpm@...ux-foundation.org>,
        Christoph Hellwig <hch@....de>,
        Dan Williams <dan.j.williams@...el.com>,
        Dave Chinner <david@...morbit.com>, Jan Kara <jack@...e.com>,
        Matthew Wilcox <mawilcox@...rosoft.com>,
        linux-ext4@...r.kernel.org, linux-fsdevel@...r.kernel.org,
        linux-mm@...ck.org, linux-nvdimm@...ts.01.org,
        linux-xfs@...r.kernel.org
Subject: Re: [PATCH v4 08/12] dax: remove dax_pmd_fault()

On Thu 29-09-16 16:49:26, Ross Zwisler wrote:
> dax_pmd_fault() is the old struct buffer_head + get_block_t based 2 MiB DAX
> fault handler.  This fault handler has been disabled for several kernel
> releases, and support for PMDs will be reintroduced using the struct iomap
> interface instead.
> 
> Signed-off-by: Ross Zwisler <ross.zwisler@...ux.intel.com>

Looks good. You can add:

Reviewed-by: Jan Kara <jack@...e.cz>

								Honza

> ---
>  fs/dax.c            | 213 ----------------------------------------------------
>  include/linux/dax.h |   6 +-
>  2 files changed, 1 insertion(+), 218 deletions(-)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index 406feea..b5e7b13 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -909,219 +909,6 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  }
>  EXPORT_SYMBOL_GPL(dax_fault);
>  
> -#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
> -/*
> - * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
> - * more often than one might expect in the below function.
> - */
> -#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
> -
> -static void __dax_dbg(struct buffer_head *bh, unsigned long address,
> -		const char *reason, const char *fn)
> -{
> -	if (bh) {
> -		char bname[BDEVNAME_SIZE];
> -		bdevname(bh->b_bdev, bname);
> -		pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
> -			"length %zd fallback: %s\n", fn, current->comm,
> -			address, bname, bh->b_state, (u64)bh->b_blocknr,
> -			bh->b_size, reason);
> -	} else {
> -		pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
> -			current->comm, address, reason);
> -	}
> -}
> -
> -#define dax_pmd_dbg(bh, address, reason)	__dax_dbg(bh, address, reason, "dax_pmd")
> -
> -/**
> - * dax_pmd_fault - handle a PMD fault on a DAX file
> - * @vma: The virtual memory area where the fault occurred
> - * @vmf: The description of the fault
> - * @get_block: The filesystem method used to translate file offsets to blocks
> - *
> - * When a page fault occurs, filesystems may call this helper in their
> - * pmd_fault handler for DAX files.
> - */
> -int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
> -		pmd_t *pmd, unsigned int flags, get_block_t get_block)
> -{
> -	struct file *file = vma->vm_file;
> -	struct address_space *mapping = file->f_mapping;
> -	struct inode *inode = mapping->host;
> -	struct buffer_head bh;
> -	unsigned blkbits = inode->i_blkbits;
> -	unsigned long pmd_addr = address & PMD_MASK;
> -	bool write = flags & FAULT_FLAG_WRITE;
> -	struct block_device *bdev;
> -	pgoff_t size, pgoff;
> -	sector_t block;
> -	int result = 0;
> -	bool alloc = false;
> -
> -	/* dax pmd mappings require pfn_t_devmap() */
> -	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
> -		return VM_FAULT_FALLBACK;
> -
> -	/* Fall back to PTEs if we're going to COW */
> -	if (write && !(vma->vm_flags & VM_SHARED)) {
> -		split_huge_pmd(vma, pmd, address);
> -		dax_pmd_dbg(NULL, address, "cow write");
> -		return VM_FAULT_FALLBACK;
> -	}
> -	/* If the PMD would extend outside the VMA */
> -	if (pmd_addr < vma->vm_start) {
> -		dax_pmd_dbg(NULL, address, "vma start unaligned");
> -		return VM_FAULT_FALLBACK;
> -	}
> -	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
> -		dax_pmd_dbg(NULL, address, "vma end unaligned");
> -		return VM_FAULT_FALLBACK;
> -	}
> -
> -	pgoff = linear_page_index(vma, pmd_addr);
> -	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
> -	if (pgoff >= size)
> -		return VM_FAULT_SIGBUS;
> -	/* If the PMD would cover blocks out of the file */
> -	if ((pgoff | PG_PMD_COLOUR) >= size) {
> -		dax_pmd_dbg(NULL, address,
> -				"offset + huge page size > file size");
> -		return VM_FAULT_FALLBACK;
> -	}
> -
> -	memset(&bh, 0, sizeof(bh));
> -	bh.b_bdev = inode->i_sb->s_bdev;
> -	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
> -
> -	bh.b_size = PMD_SIZE;
> -
> -	if (get_block(inode, block, &bh, 0) != 0)
> -		return VM_FAULT_SIGBUS;
> -
> -	if (!buffer_mapped(&bh) && write) {
> -		if (get_block(inode, block, &bh, 1) != 0)
> -			return VM_FAULT_SIGBUS;
> -		alloc = true;
> -		WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
> -	}
> -
> -	bdev = bh.b_bdev;
> -
> -	if (bh.b_size < PMD_SIZE) {
> -		dax_pmd_dbg(&bh, address, "allocated block too small");
> -		return VM_FAULT_FALLBACK;
> -	}
> -
> -	/*
> -	 * If we allocated new storage, make sure no process has any
> -	 * zero pages covering this hole
> -	 */
> -	if (alloc) {
> -		loff_t lstart = pgoff << PAGE_SHIFT;
> -		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
> -
> -		truncate_pagecache_range(inode, lstart, lend);
> -	}
> -
> -	if (!write && !buffer_mapped(&bh)) {
> -		spinlock_t *ptl;
> -		pmd_t entry;
> -		struct page *zero_page = get_huge_zero_page();
> -
> -		if (unlikely(!zero_page)) {
> -			dax_pmd_dbg(&bh, address, "no zero page");
> -			goto fallback;
> -		}
> -
> -		ptl = pmd_lock(vma->vm_mm, pmd);
> -		if (!pmd_none(*pmd)) {
> -			spin_unlock(ptl);
> -			dax_pmd_dbg(&bh, address, "pmd already present");
> -			goto fallback;
> -		}
> -
> -		dev_dbg(part_to_dev(bdev->bd_part),
> -				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
> -				__func__, current->comm, address,
> -				(unsigned long long) to_sector(&bh, inode));
> -
> -		entry = mk_pmd(zero_page, vma->vm_page_prot);
> -		entry = pmd_mkhuge(entry);
> -		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
> -		result = VM_FAULT_NOPAGE;
> -		spin_unlock(ptl);
> -	} else {
> -		struct blk_dax_ctl dax = {
> -			.sector = to_sector(&bh, inode),
> -			.size = PMD_SIZE,
> -		};
> -		long length = dax_map_atomic(bdev, &dax);
> -
> -		if (length < 0) {
> -			dax_pmd_dbg(&bh, address, "dax-error fallback");
> -			goto fallback;
> -		}
> -		if (length < PMD_SIZE) {
> -			dax_pmd_dbg(&bh, address, "dax-length too small");
> -			dax_unmap_atomic(bdev, &dax);
> -			goto fallback;
> -		}
> -		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
> -			dax_pmd_dbg(&bh, address, "pfn unaligned");
> -			dax_unmap_atomic(bdev, &dax);
> -			goto fallback;
> -		}
> -
> -		if (!pfn_t_devmap(dax.pfn)) {
> -			dax_unmap_atomic(bdev, &dax);
> -			dax_pmd_dbg(&bh, address, "pfn not in memmap");
> -			goto fallback;
> -		}
> -		dax_unmap_atomic(bdev, &dax);
> -
> -		/*
> -		 * For PTE faults we insert a radix tree entry for reads, and
> -		 * leave it clean.  Then on the first write we dirty the radix
> -		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
> -		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
> -		 * call into get_block() to translate the pgoff to a sector in
> -		 * order to be able to create a new radix tree entry.
> -		 *
> -		 * The PMD path doesn't have an equivalent to
> -		 * dax_pfn_mkwrite(), though, so for a read followed by a
> -		 * write we traverse all the way through dax_pmd_fault()
> -		 * twice.  This means we can just skip inserting a radix tree
> -		 * entry completely on the initial read and just wait until
> -		 * the write to insert a dirty entry.
> -		 */
> -		if (write) {
> -			/*
> -			 * We should insert radix-tree entry and dirty it here.
> -			 * For now this is broken...
> -			 */
> -		}
> -
> -		dev_dbg(part_to_dev(bdev->bd_part),
> -				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
> -				__func__, current->comm, address,
> -				pfn_t_to_pfn(dax.pfn),
> -				(unsigned long long) dax.sector);
> -		result |= vmf_insert_pfn_pmd(vma, address, pmd,
> -				dax.pfn, write);
> -	}
> -
> - out:
> -	return result;
> -
> - fallback:
> -	count_vm_event(THP_FAULT_FALLBACK);
> -	result = VM_FAULT_FALLBACK;
> -	goto out;
> -}
> -EXPORT_SYMBOL_GPL(dax_pmd_fault);
> -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
> -
>  /**
>   * dax_pfn_mkwrite - handle first write to DAX page
>   * @vma: The virtual memory area where the fault occurred
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index 4065601..d9a8350 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -48,16 +48,12 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
>  }
>  #endif
>  
> -#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
> -int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
> -				unsigned int flags, get_block_t);
> -#else
>  static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
>  				pmd_t *pmd, unsigned int flags, get_block_t gb)
>  {
>  	return VM_FAULT_FALLBACK;
>  }
> -#endif
> +
>  int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
>  #define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
>  
> -- 
> 2.7.4
> 
> 
-- 
Jan Kara <jack@...e.com>
SUSE Labs, CR