[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20091214180419.GB23705@kroah.com>
Date: Mon, 14 Dec 2009 10:04:19 -0800
From: Greg KH <gregkh@...e.de>
To: linux-kernel@...r.kernel.org,
Andrew Morton <akpm@...ux-foundation.org>,
torvalds@...ux-foundation.org, stable@...nel.org, lwn@....net
Subject: Re: Linux 2.6.31.8
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 7be02ac..32c3da4 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -153,8 +153,8 @@ journal_dev=devnum When the external journal device's major/minor numbers
identified through its new major/minor numbers encoded
in devnum.
-noload Don't load the journal on mounting. Note that
- if the filesystem was not unmounted cleanly,
+norecovery Don't load the journal on mounting. Note that
+noload if the filesystem was not unmounted cleanly,
skipping the journal replay will lead to the
filesystem containing inconsistencies that can
lead to any number of problems.
@@ -338,6 +338,12 @@ noauto_da_alloc replacing existing files via patterns such as
system crashes before the delayed allocation
blocks are forced to disk.
+discard Controls whether ext4 should issue discard/TRIM
+nodiscard(*) commands to the underlying block device when
+ blocks are freed. This is useful for SSD devices
+ and sparse/thinly-provisioned LUNs, but it is off
+ by default until sufficient testing has been done.
+
Data Mode
=========
There are 3 different data modes:
diff --git a/Makefile b/Makefile
index 2e405c8..6eefb00 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 31
-EXTRAVERSION = .7
+EXTRAVERSION = .8
NAME = Man-Eating Seals of Antiquity
# *DOCUMENTATION*
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 5fd2da4..28a753d 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -180,14 +180,20 @@ void scsi_remove_host(struct Scsi_Host *shost)
EXPORT_SYMBOL(scsi_remove_host);
/**
- * scsi_add_host - add a scsi host
+ * scsi_add_host_with_dma - add a scsi host with dma device
* @shost: scsi host pointer to add
* @dev: a struct device of type scsi class
+ * @dma_dev: dma device for the host
+ *
+ * Note: You rarely need to worry about this unless you're in a
+ * virtualised host environments, so use the simpler scsi_add_host()
+ * function instead.
*
* Return value:
* 0 on success / != 0 for error
**/
-int scsi_add_host(struct Scsi_Host *shost, struct device *dev)
+int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
+ struct device *dma_dev)
{
struct scsi_host_template *sht = shost->hostt;
int error = -EINVAL;
@@ -207,6 +213,7 @@ int scsi_add_host(struct Scsi_Host *shost, struct device *dev)
if (!shost->shost_gendev.parent)
shost->shost_gendev.parent = dev ? dev : &platform_bus;
+ shost->dma_dev = dma_dev;
error = device_add(&shost->shost_gendev);
if (error)
@@ -262,7 +269,7 @@ int scsi_add_host(struct Scsi_Host *shost, struct device *dev)
fail:
return error;
}
-EXPORT_SYMBOL(scsi_add_host);
+EXPORT_SYMBOL(scsi_add_host_with_dma);
static void scsi_host_dev_release(struct device *dev)
{
diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index fc67cc6..cf13ff2 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -2384,7 +2384,7 @@ lpfc_create_port(struct lpfc_hba *phba, int instance, struct device *dev)
vport->els_tmofunc.function = lpfc_els_timeout;
vport->els_tmofunc.data = (unsigned long)vport;
- error = scsi_add_host(shost, dev);
+ error = scsi_add_host_with_dma(shost, dev, &phba->pcidev->dev);
if (error)
goto out_put_shost;
diff --git a/drivers/scsi/megaraid/megaraid_sas.c b/drivers/scsi/megaraid/megaraid_sas.c
index 7dc3d18..7a838c8 100644
--- a/drivers/scsi/megaraid/megaraid_sas.c
+++ b/drivers/scsi/megaraid/megaraid_sas.c
@@ -3032,7 +3032,7 @@ megasas_mgmt_fw_ioctl(struct megasas_instance *instance,
int error = 0, i;
void *sense = NULL;
dma_addr_t sense_handle;
- u32 *sense_ptr;
+ unsigned long *sense_ptr;
memset(kbuff_arr, 0, sizeof(kbuff_arr));
@@ -3109,7 +3109,7 @@ megasas_mgmt_fw_ioctl(struct megasas_instance *instance,
}
sense_ptr =
- (u32 *) ((unsigned long)cmd->frame + ioc->sense_off);
+ (unsigned long *) ((unsigned long)cmd->frame + ioc->sense_off);
*sense_ptr = sense_handle;
}
@@ -3140,8 +3140,8 @@ megasas_mgmt_fw_ioctl(struct megasas_instance *instance,
* sense_ptr points to the location that has the user
* sense buffer address
*/
- sense_ptr = (u32 *) ((unsigned long)ioc->frame.raw +
- ioc->sense_off);
+ sense_ptr = (unsigned long *) ((unsigned long)ioc->frame.raw +
+ ioc->sense_off);
if (copy_to_user((void __user *)((unsigned long)(*sense_ptr)),
sense, ioc->sense_len)) {
diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c
index 0f87962..67e016d 100644
--- a/drivers/scsi/qla2xxx/qla_attr.c
+++ b/drivers/scsi/qla2xxx/qla_attr.c
@@ -1654,7 +1654,8 @@ qla24xx_vport_create(struct fc_vport *fc_vport, bool disable)
fc_vport_set_state(fc_vport, FC_VPORT_LINKDOWN);
}
- if (scsi_add_host(vha->host, &fc_vport->dev)) {
+ if (scsi_add_host_with_dma(vha->host, &fc_vport->dev,
+ &ha->pdev->dev)) {
DEBUG15(printk("scsi(%ld): scsi_add_host failure for VP[%d].\n",
vha->host_no, vha->vp_idx));
goto vport_create_failed_2;
diff --git a/drivers/scsi/scsi_lib_dma.c b/drivers/scsi/scsi_lib_dma.c
index ac6855c..dcd1285 100644
--- a/drivers/scsi/scsi_lib_dma.c
+++ b/drivers/scsi/scsi_lib_dma.c
@@ -23,7 +23,7 @@ int scsi_dma_map(struct scsi_cmnd *cmd)
int nseg = 0;
if (scsi_sg_count(cmd)) {
- struct device *dev = cmd->device->host->shost_gendev.parent;
+ struct device *dev = cmd->device->host->dma_dev;
nseg = dma_map_sg(dev, scsi_sglist(cmd), scsi_sg_count(cmd),
cmd->sc_data_direction);
@@ -41,7 +41,7 @@ EXPORT_SYMBOL(scsi_dma_map);
void scsi_dma_unmap(struct scsi_cmnd *cmd)
{
if (scsi_sg_count(cmd)) {
- struct device *dev = cmd->device->host->shost_gendev.parent;
+ struct device *dev = cmd->device->host->dma_dev;
dma_unmap_sg(dev, scsi_sglist(cmd), scsi_sg_count(cmd),
cmd->sc_data_direction);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e2126d7..34bb797 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -761,7 +761,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
ext4_group_t group)
{
- return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0;
+ if (!ext4_bg_has_super(sb, group))
+ return 0;
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
+ return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
+ else
+ return EXT4_SB(sb)->s_gdb_count;
}
/**
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 50784ef..dc79b75 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -160,7 +160,7 @@ int ext4_setup_system_zone(struct super_block *sb)
if (ext4_bg_has_super(sb, i) &&
((i < 5) || ((i % flex_size) == 0)))
add_system_zone(sbi, ext4_group_first_block_no(sb, i),
- sbi->s_gdb_count + 1);
+ ext4_bg_num_gdb(sb, i) + 1);
gdp = ext4_get_group_desc(sb, i, NULL);
ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
if (ret)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9714db3..3b8321b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -88,6 +88,8 @@ typedef unsigned int ext4_group_t;
#define EXT4_MB_HINT_TRY_GOAL 512
/* blocks already pre-reserved by delayed allocation */
#define EXT4_MB_DELALLOC_RESERVED 1024
+/* We are doing stream allocation */
+#define EXT4_MB_STREAM_ALLOC 2048
struct ext4_allocation_request {
@@ -111,6 +113,33 @@ struct ext4_allocation_request {
unsigned int flags;
};
+#define DIO_AIO_UNWRITTEN 0x1
+typedef struct ext4_io_end {
+ struct list_head list; /* per-file finished AIO list */
+ struct inode *inode; /* file being written to */
+ unsigned int flag; /* sync IO or AIO */
+ int error; /* I/O error code */
+ ext4_lblk_t offset; /* offset in the file */
+ size_t size; /* size of the extent */
+ struct work_struct work; /* data work queue */
+} ext4_io_end_t;
+
+/*
+ * Delayed allocation stuff
+ */
+
+struct mpage_da_data {
+ struct inode *inode;
+ sector_t b_blocknr; /* start block number of extent */
+ size_t b_size; /* size of extent */
+ unsigned long b_state; /* state of the extent */
+ unsigned long first_page, next_page; /* extent of pages */
+ struct writeback_control *wbc;
+ int io_done;
+ int pages_written;
+ int retval;
+};
+
/*
* Special inodes numbers
*/
@@ -251,7 +280,6 @@ struct flex_groups {
#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
-#define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
@@ -289,6 +317,8 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
+#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
+#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/
/* Used to pass group descriptor data when online resize is done */
struct ext4_new_group_input {
@@ -330,7 +360,16 @@ struct ext4_new_group_data {
/* Call ext4_da_update_reserve_space() after successfully
allocating the blocks */
#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
-
+ /* caller is from the direct IO path, request to creation of an
+ unitialized extents if not allocated, split the uninitialized
+ extent if blocks has been preallocated already*/
+#define EXT4_GET_BLOCKS_DIO 0x0010
+#define EXT4_GET_BLOCKS_CONVERT 0x0020
+#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\
+ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+ /* Convert extent to initialized after direct IO complete */
+#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
+ EXT4_GET_BLOCKS_DIO_CREATE_EXT)
/*
* ioctl commands
@@ -386,6 +425,9 @@ struct ext4_mount_options {
#endif
};
+/* Max physical block we can addres w/o extents */
+#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
+
/*
* Structure of an inode on the disk
*/
@@ -481,8 +523,8 @@ struct move_extent {
static inline __le32 ext4_encode_extra_time(struct timespec *time)
{
return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
- time->tv_sec >> 32 : 0) |
- ((time->tv_nsec << 2) & EXT4_NSEC_MASK));
+ (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
+ ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
}
static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
@@ -490,7 +532,7 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
if (sizeof(time->tv_sec) > 4)
time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
<< 32;
- time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2;
+ time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
}
#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
@@ -653,6 +695,18 @@ struct ext4_inode_info {
__u16 i_extra_isize;
spinlock_t i_block_reservation_lock;
+
+ /* completed async DIOs that might need unwritten extents handling */
+ struct list_head i_aio_dio_complete_list;
+ /* current io_end structure for async DIO write*/
+ ext4_io_end_t *cur_aio_dio;
+
+ /*
+ * Transactions that contain inode's metadata needed to complete
+ * fsync and fdatasync, respectively.
+ */
+ tid_t i_sync_tid;
+ tid_t i_datasync_tid;
};
/*
@@ -700,6 +754,7 @@ struct ext4_inode_info {
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
+#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
#define set_opt(o, opt) o |= EXT4_MOUNT_##opt
@@ -841,6 +896,7 @@ struct ext4_sb_info {
unsigned long s_gdb_count; /* Number of group descriptor blocks */
unsigned long s_desc_per_block; /* Number of group descriptors per block */
ext4_group_t s_groups_count; /* Number of groups in the fs */
+ ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
unsigned long s_overhead_last; /* Last calculated overhead */
unsigned long s_blocks_last; /* Last seen block count */
loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
@@ -923,6 +979,7 @@ struct ext4_sb_info {
unsigned int s_mb_stats;
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
+ unsigned int s_max_writeback_mb_bump;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
unsigned long s_mb_last_start;
@@ -950,6 +1007,7 @@ struct ext4_sb_info {
atomic_t s_mb_lost_chunks;
atomic_t s_mb_preallocated;
atomic_t s_mb_discarded;
+ atomic_t s_lock_busy;
/* locality groups */
struct ext4_locality_group *s_locality_groups;
@@ -960,6 +1018,9 @@ struct ext4_sb_info {
unsigned int s_log_groups_per_flex;
struct flex_groups *s_flex_groups;
+
+ /* workqueue for dio unwritten */
+ struct workqueue_struct *dio_unwritten_wq;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1367,6 +1428,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate(struct inode *);
+extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
extern int ext4_alloc_da_blocks(struct inode *inode);
@@ -1378,7 +1440,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t ext4_get_reserved_space(struct inode *inode);
-
+extern int flush_aio_dio_completed_IO(struct inode *inode);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1591,15 +1653,42 @@ struct ext4_group_info {
#define EXT4_MB_GRP_NEED_INIT(grp) \
(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+#define EXT4_MAX_CONTENTION 8
+#define EXT4_CONTENTION_THRESHOLD 2
+
static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
ext4_group_t group)
{
return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
}
+/*
+ * Returns true if the filesystem is busy enough that attempts to
+ * access the block group locks has run into contention.
+ */
+static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
+{
+ return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
+}
+
static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
{
- spin_lock(ext4_group_lock_ptr(sb, group));
+ spinlock_t *lock = ext4_group_lock_ptr(sb, group);
+ if (spin_trylock(lock))
+ /*
+ * We're able to grab the lock right away, so drop the
+ * lock contention counter.
+ */
+ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
+ else {
+ /*
+ * The lock is busy, so bump the contention counter,
+ * and then wait on the spin lock.
+ */
+ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
+ EXT4_MAX_CONTENTION);
+ spin_lock(lock);
+ }
}
static inline void ext4_unlock_group(struct super_block *sb,
@@ -1650,6 +1739,8 @@ extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
loff_t len);
+extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
+ loff_t len);
extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
sector_t block, unsigned int max_blocks,
struct buffer_head *bh, int flags);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 20a8410..1c2db3f 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
}
+static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
+{
+ ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
+}
+
extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
@@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *);
extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
-extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
+extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
ext_prepare_callback, void *);
extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index eb27fd0..6a94099 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle,
handle, err);
}
else
- brelse(bh);
+ bforget(bh);
return err;
}
@@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
handle, err);
}
else
- brelse(bh);
+ bforget(bh);
return err;
}
@@ -89,7 +89,10 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
ext4_journal_abort_handle(where, __func__, bh,
handle, err);
} else {
- mark_buffer_dirty(bh);
+ if (inode && bh)
+ mark_buffer_dirty_inode(bh, inode);
+ else
+ mark_buffer_dirty(bh);
if (inode && inode_needs_sync(inode)) {
sync_dirty_buffer(bh);
if (buffer_req(bh) && !buffer_uptodate(bh)) {
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 139fb8c..1892a77 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -49,7 +49,7 @@
#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
EXT4_XATTR_TRANS_BLOCKS - 2 + \
- 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
+ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
/*
* Define the number of metadata blocks we need to account to modify data.
@@ -57,7 +57,7 @@
* This include super block, inode block, quota blocks and xattr blocks
*/
#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
- 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
+ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
/* Delete operations potentially hit one directory's namespace plus an
* entire inode, plus arbitrary amounts of bitmap/indirection data. Be
@@ -92,6 +92,7 @@
* but inode, sb and group updates are done only once */
#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
+
#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
#else
@@ -99,6 +100,9 @@
#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
#endif
+#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
int
ext4_mark_iloc_dirty(handle_t *handle,
@@ -161,11 +165,13 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
int __ext4_journal_stop(const char *where, handle_t *handle);
-#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1)
+#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
+/* Note: Do not use this for NULL handles. This is only to determine if
+ * a properly allocated handle is using a journal or not. */
static inline int ext4_handle_valid(handle_t *handle)
{
- if (handle == EXT4_NOJOURNAL_HANDLE)
+ if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
return 0;
return 1;
}
@@ -252,6 +258,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
return 0;
}
+static inline void ext4_update_inode_fsync_trans(handle_t *handle,
+ struct inode *inode,
+ int datasync)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (ext4_handle_valid(handle)) {
+ ei->i_sync_tid = handle->h_transaction->t_tid;
+ if (datasync)
+ ei->i_datasync_tid = handle->h_transaction->t_tid;
+ }
+}
+
/* super.c */
int ext4_force_commit(struct super_block *sb);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 73ebfb4..24fb20b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
}
-static int ext4_ext_journal_restart(handle_t *handle, int needed)
+static int ext4_ext_truncate_extend_restart(handle_t *handle,
+ struct inode *inode,
+ int needed)
{
int err;
@@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
err = ext4_journal_extend(handle, needed);
if (err <= 0)
return err;
- return ext4_journal_restart(handle, needed);
+ err = ext4_truncate_restart_trans(handle, inode, needed);
+ /*
+ * We have dropped i_data_sem so someone might have cached again
+ * an extent we are going to truncate.
+ */
+ ext4_ext_invalidate_cache(inode);
+
+ return err;
}
/*
@@ -701,7 +710,7 @@ err:
* insert new index [@logical;@ptr] into the block at @curp;
* check where to insert: before @curp or after @curp
*/
-static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
+int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
struct ext4_ext_path *curp,
int logical, ext4_fsblk_t ptr)
{
@@ -1563,7 +1572,7 @@ out:
*/
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
- struct ext4_extent *newext)
+ struct ext4_extent *newext, int flag)
{
struct ext4_extent_header *eh;
struct ext4_extent *ex, *fex;
@@ -1579,7 +1588,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
BUG_ON(path[depth].p_hdr == NULL);
/* try to insert block into found extent and return */
- if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
+ if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+ && ext4_can_extents_be_merged(inode, ex, newext)) {
ext_debug("append %d block to %d:%d (from %llu)\n",
ext4_ext_get_actual_len(newext),
le32_to_cpu(ex->ee_block),
@@ -1694,7 +1704,8 @@ has_space:
merge:
/* try to merge extents to the right */
- ext4_ext_try_to_merge(inode, path, nearex);
+ if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+ ext4_ext_try_to_merge(inode, path, nearex);
/* try to merge extents to the left */
@@ -1731,7 +1742,9 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
while (block < last && block != EXT_MAX_BLOCK) {
num = last - block;
/* find extent for this block */
+ down_read(&EXT4_I(inode)->i_data_sem);
path = ext4_ext_find_extent(inode, block, path);
+ up_read(&EXT4_I(inode)->i_data_sem);
if (IS_ERR(path)) {
err = PTR_ERR(path);
path = NULL;
@@ -2044,7 +2057,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
ext_debug("free last %u blocks starting %llu\n", num, start);
for (i = 0; i < num; i++) {
bh = sb_find_get_block(inode->i_sb, start + i);
- ext4_forget(handle, 0, inode, bh, start + i);
+ ext4_forget(handle, metadata, inode, bh, start + i);
}
ext4_free_blocks(handle, inode, start, num, metadata);
} else if (from == le32_to_cpu(ex->ee_block)
@@ -2136,9 +2149,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
correct_index = 1;
credits += (ext_depth(inode)) + 1;
}
- credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+ credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
- err = ext4_ext_journal_restart(handle, credits);
+ err = ext4_ext_truncate_extend_restart(handle, inode, credits);
if (err)
goto out;
@@ -2461,7 +2474,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
}
#define EXT4_EXT_ZERO_LEN 7
-
/*
* This function is called by ext4_ext_get_blocks() if someone tries to write
* to an uninitialized extent. It may result in splitting the uninitialized
@@ -2554,7 +2566,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ex3->ee_block = cpu_to_le32(iblock);
ext4_ext_store_pblock(ex3, newblock);
ex3->ee_len = cpu_to_le16(allocated);
- err = ext4_ext_insert_extent(handle, inode, path, ex3);
+ err = ext4_ext_insert_extent(handle, inode, path,
+ ex3, 0);
if (err == -ENOSPC) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
@@ -2610,7 +2623,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ext4_ext_store_pblock(ex3, newblock + max_blocks);
ex3->ee_len = cpu_to_le16(allocated - max_blocks);
ext4_ext_mark_uninitialized(ex3);
- err = ext4_ext_insert_extent(handle, inode, path, ex3);
+ err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
if (err == -ENOSPC) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
@@ -2728,7 +2741,191 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
err = ext4_ext_dirty(handle, inode, path + depth);
goto out;
insert:
- err = ext4_ext_insert_extent(handle, inode, path, &newex);
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
+ if (err == -ENOSPC) {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+ goto fix_extent_len;
+ /* update the extent length and mark as initialized */
+ ex->ee_block = orig_ex.ee_block;
+ ex->ee_len = orig_ex.ee_len;
+ ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+ ext4_ext_dirty(handle, inode, path + depth);
+ /* zero out the first half */
+ return allocated;
+ } else if (err)
+ goto fix_extent_len;
+out:
+ return err ? err : allocated;
+
+fix_extent_len:
+ ex->ee_block = orig_ex.ee_block;
+ ex->ee_len = orig_ex.ee_len;
+ ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+ ext4_ext_mark_uninitialized(ex);
+ ext4_ext_dirty(handle, inode, path + depth);
+ return err;
+}
+
+/*
+ * This function is called by ext4_ext_get_blocks() from
+ * ext4_get_blocks_dio_write() when DIO to write
+ * to an uninitialized extent.
+ *
+ * Writing to an uninitized extent may result in splitting the uninitialized
+ * extent into multiple /intialized unintialized extents (up to three)
+ * There are three possibilities:
+ * a> There is no split required: Entire extent should be uninitialized
+ * b> Splits in two extents: Write is happening at either end of the extent
+ * c> Splits in three extents: Somone is writing in middle of the extent
+ *
+ * One of more index blocks maybe needed if the extent tree grow after
+ * the unintialized extent split. To prevent ENOSPC occur at the IO
+ * complete, we need to split the uninitialized extent before DIO submit
+ * the IO. The uninitilized extent called at this time will be split
+ * into three uninitialized extent(at most). After IO complete, the part
+ * being filled will be convert to initialized by the end_io callback function
+ * via ext4_convert_unwritten_extents().
+ *
+ * Returns the size of uninitialized extent to be written on success.
+ */
+static int ext4_split_unwritten_extents(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t iblock,
+ unsigned int max_blocks,
+ int flags)
+{
+ struct ext4_extent *ex, newex, orig_ex;
+ struct ext4_extent *ex1 = NULL;
+ struct ext4_extent *ex2 = NULL;
+ struct ext4_extent *ex3 = NULL;
+ struct ext4_extent_header *eh;
+ ext4_lblk_t ee_block;
+ unsigned int allocated, ee_len, depth;
+ ext4_fsblk_t newblock;
+ int err = 0;
+
+ ext_debug("ext4_split_unwritten_extents: inode %lu,"
+ "iblock %llu, max_blocks %u\n", inode->i_ino,
+ (unsigned long long)iblock, max_blocks);
+ depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+ allocated = ee_len - (iblock - ee_block);
+ newblock = iblock - ee_block + ext_pblock(ex);
+ ex2 = ex;
+ orig_ex.ee_block = ex->ee_block;
+ orig_ex.ee_len = cpu_to_le16(ee_len);
+ ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+
+ /*
+ * If the uninitialized extent begins at the same logical
+ * block where the write begins, and the write completely
+ * covers the extent, then we don't need to split it.
+ */
+ if ((iblock == ee_block) && (allocated <= max_blocks))
+ return allocated;
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+ /* ex1: ee_block to iblock - 1 : uninitialized */
+ if (iblock > ee_block) {
+ ex1 = ex;
+ ex1->ee_len = cpu_to_le16(iblock - ee_block);
+ ext4_ext_mark_uninitialized(ex1);
+ ex2 = &newex;
+ }
+ /*
+ * for sanity, update the length of the ex2 extent before
+ * we insert ex3, if ex1 is NULL. This is to avoid temporary
+ * overlap of blocks.
+ */
+ if (!ex1 && allocated > max_blocks)
+ ex2->ee_len = cpu_to_le16(max_blocks);
+ /* ex3: to ee_block + ee_len : uninitialised */
+ if (allocated > max_blocks) {
+ unsigned int newdepth;
+ ex3 = &newex;
+ ex3->ee_block = cpu_to_le32(iblock + max_blocks);
+ ext4_ext_store_pblock(ex3, newblock + max_blocks);
+ ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+ ext4_ext_mark_uninitialized(ex3);
+ err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
+ if (err == -ENOSPC) {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+ goto fix_extent_len;
+ /* update the extent length and mark as initialized */
+ ex->ee_block = orig_ex.ee_block;
+ ex->ee_len = orig_ex.ee_len;
+ ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+ ext4_ext_dirty(handle, inode, path + depth);
+ /* zeroed the full extent */
+ /* blocks available from iblock */
+ return allocated;
+
+ } else if (err)
+ goto fix_extent_len;
+ /*
+ * The depth, and hence eh & ex might change
+ * as part of the insert above.
+ */
+ newdepth = ext_depth(inode);
+ /*
+ * update the extent length after successful insert of the
+ * split extent
+ */
+ orig_ex.ee_len = cpu_to_le16(ee_len -
+ ext4_ext_get_actual_len(ex3));
+ depth = newdepth;
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode, iblock, path);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out;
+ }
+ eh = path[depth].p_hdr;
+ ex = path[depth].p_ext;
+ if (ex2 != &newex)
+ ex2 = ex;
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ allocated = max_blocks;
+ }
+ /*
+ * If there was a change of depth as part of the
+ * insertion of ex3 above, we need to update the length
+ * of the ex1 extent again here
+ */
+ if (ex1 && ex1 != ex) {
+ ex1 = ex;
+ ex1->ee_len = cpu_to_le16(iblock - ee_block);
+ ext4_ext_mark_uninitialized(ex1);
+ ex2 = &newex;
+ }
+ /*
+ * ex2: iblock to iblock + maxblocks-1 : to be direct IO written,
+ * uninitialised still.
+ */
+ ex2->ee_block = cpu_to_le32(iblock);
+ ext4_ext_store_pblock(ex2, newblock);
+ ex2->ee_len = cpu_to_le16(allocated);
+ ext4_ext_mark_uninitialized(ex2);
+ if (ex2 != ex)
+ goto insert;
+ /* Mark modified extent as dirty */
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ ext_debug("out here\n");
+ goto out;
+insert:
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err == -ENOSPC) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
@@ -2743,6 +2940,7 @@ insert:
} else if (err)
goto fix_extent_len;
out:
+ ext4_ext_show_leaf(inode, path);
return err ? err : allocated;
fix_extent_len:
@@ -2753,7 +2951,151 @@ fix_extent_len:
ext4_ext_dirty(handle, inode, path + depth);
return err;
}
+static int ext4_convert_unwritten_extents_dio(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ struct ext4_extent *ex;
+ struct ext4_extent_header *eh;
+ int depth;
+ int err = 0;
+ int ret = 0;
+
+ depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
+ ex = path[depth].p_ext;
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+ /* first mark the extent as initialized */
+ ext4_ext_mark_initialized(ex);
+
+ /*
+ * We have to see if it can be merged with the extent
+ * on the left.
+ */
+ if (ex > EXT_FIRST_EXTENT(eh)) {
+ /*
+ * To merge left, pass "ex - 1" to try_to_merge(),
+ * since it merges towards right _only_.
+ */
+ ret = ext4_ext_try_to_merge(inode, path, ex - 1);
+ if (ret) {
+ err = ext4_ext_correct_indexes(handle, inode, path);
+ if (err)
+ goto out;
+ depth = ext_depth(inode);
+ ex--;
+ }
+ }
+ /*
+ * Try to Merge towards right.
+ */
+ ret = ext4_ext_try_to_merge(inode, path, ex);
+ if (ret) {
+ err = ext4_ext_correct_indexes(handle, inode, path);
+ if (err)
+ goto out;
+ depth = ext_depth(inode);
+ }
+ /* Mark modified extent as dirty */
+ err = ext4_ext_dirty(handle, inode, path + depth);
+out:
+ ext4_ext_show_leaf(inode, path);
+ return err;
+}
+
+static int
+ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, unsigned int max_blocks,
+ struct ext4_ext_path *path, int flags,
+ unsigned int allocated, struct buffer_head *bh_result,
+ ext4_fsblk_t newblock)
+{
+ int ret = 0;
+ int err = 0;
+ ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+
+ ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
+ "block %llu, max_blocks %u, flags %d, allocated %u",
+ inode->i_ino, (unsigned long long)iblock, max_blocks,
+ flags, allocated);
+ ext4_ext_show_leaf(inode, path);
+ /* DIO get_block() before submit the IO, split the extent */
+ if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+ ret = ext4_split_unwritten_extents(handle,
+ inode, path, iblock,
+ max_blocks, flags);
+ /*
+ * Flag the inode(non aio case) or end_io struct (aio case)
+ * that this IO needs to convertion to written when IO is
+ * completed
+ */
+ if (io)
+ io->flag = DIO_AIO_UNWRITTEN;
+ else
+ EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
+ goto out;
+ }
+ /* async DIO end_io complete, convert the filled extent to written */
+ if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
+ ret = ext4_convert_unwritten_extents_dio(handle, inode,
+ path);
+ if (ret >= 0)
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+ goto out2;
+ }
+ /* buffered IO case */
+ /*
+ * repeat fallocate creation request
+ * we already have an unwritten extent
+ */
+ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
+ goto map_out;
+
+ /* buffered READ or buffered write_begin() lookup */
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+ /*
+ * We have blocks reserved already. We
+ * return allocated blocks so that delalloc
+ * won't do block reservation for us. But
+ * the buffer head will be unmapped so that
+ * a read from the block returns 0s.
+ */
+ set_buffer_unwritten(bh_result);
+ goto out1;
+ }
+
+ /* buffered write, writepage time, convert*/
+ ret = ext4_ext_convert_to_initialized(handle, inode,
+ path, iblock,
+ max_blocks);
+ if (ret >= 0)
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+out:
+ if (ret <= 0) {
+ err = ret;
+ goto out2;
+ } else
+ allocated = ret;
+ set_buffer_new(bh_result);
+map_out:
+ set_buffer_mapped(bh_result);
+out1:
+ if (allocated > max_blocks)
+ allocated = max_blocks;
+ ext4_ext_show_leaf(inode, path);
+ bh_result->b_bdev = inode->i_sb->s_bdev;
+ bh_result->b_blocknr = newblock;
+out2:
+ if (path) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+ return err ? err : allocated;
+}
/*
* Block allocation/map/preallocation routine for extents based files
*
@@ -2784,6 +3126,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
int err = 0, depth, ret, cache_type;
unsigned int allocated = 0;
struct ext4_allocation_request ar;
+ ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
__clear_bit(BH_New, &bh_result->b_state);
ext_debug("blocks %u/%u requested for inode %u\n",
@@ -2859,33 +3202,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
EXT4_EXT_CACHE_EXTENT);
goto out;
}
- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
- goto out;
- if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
- if (allocated > max_blocks)
- allocated = max_blocks;
- /*
- * We have blocks reserved already. We
- * return allocated blocks so that delalloc
- * won't do block reservation for us. But
- * the buffer head will be unmapped so that
- * a read from the block returns 0s.
- */
- set_buffer_unwritten(bh_result);
- bh_result->b_bdev = inode->i_sb->s_bdev;
- bh_result->b_blocknr = newblock;
- goto out2;
- }
-
- ret = ext4_ext_convert_to_initialized(handle, inode,
- path, iblock,
- max_blocks);
- if (ret <= 0) {
- err = ret;
- goto out2;
- } else
- allocated = ret;
- goto outnew;
+ ret = ext4_ext_handle_uninitialized_extents(handle,
+ inode, iblock, max_blocks, path,
+ flags, allocated, bh_result, newblock);
+ return ret;
}
}
@@ -2956,9 +3276,27 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
/* try to insert new extent into found leaf and return */
ext4_ext_store_pblock(&newex, newblock);
newex.ee_len = cpu_to_le16(ar.len);
- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */
+ /* Mark uninitialized */
+ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
ext4_ext_mark_uninitialized(&newex);
- err = ext4_ext_insert_extent(handle, inode, path, &newex);
+ /*
+ * io_end structure was created for every async
+ * direct IO write to the middle of the file.
+ * To avoid unecessary convertion for every aio dio rewrite
+ * to the mid of file, here we flag the IO that is really
+ * need the convertion.
+ * For non asycn direct IO case, flag the inode state
+ * that we need to perform convertion when IO is done.
+ */
+ if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+ if (io)
+ io->flag = DIO_AIO_UNWRITTEN;
+ else
+ EXT4_I(inode)->i_state |=
+ EXT4_STATE_DIO_UNWRITTEN;;
+ }
+ }
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err) {
/* free data blocks we just allocated */
/* not a good idea to call discard here directly,
@@ -2972,13 +3310,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
/* previous routine could use block we allocated */
newblock = ext_pblock(&newex);
allocated = ext4_ext_get_actual_len(&newex);
-outnew:
set_buffer_new(bh_result);
- /* Cache only when it is _not_ an uninitialized extent */
- if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
+ /*
+ * Cache the extent and update transaction to commit on fdatasync only
+ * when it is _not_ an uninitialized extent.
+ */
+ if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
EXT4_EXT_CACHE_EXTENT);
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+ } else
+ ext4_update_inode_fsync_trans(handle, inode, 0);
out:
if (allocated > max_blocks)
allocated = max_blocks;
@@ -3171,6 +3514,64 @@ retry:
}
/*
+ * This function convert a range of blocks to written extents
+ * The caller of this function will pass the start offset and the size.
+ * all unwritten extents within this range will be converted to
+ * written extents.
+ *
+ * This function is called from the direct IO end io call back
+ * function, to convert the fallocated extents after IO is completed.
+ * Returns 0 on success.
+ */
+int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
+ loff_t len)
+{
+ handle_t *handle;
+ ext4_lblk_t block;
+ unsigned int max_blocks;
+ int ret = 0;
+ int ret2 = 0;
+ struct buffer_head map_bh;
+ unsigned int credits, blkbits = inode->i_blkbits;
+
+ block = offset >> blkbits;
+ /*
+ * We can't just convert len to max_blocks because
+ * If blocksize = 4096 offset = 3072 and len = 2048
+ */
+ max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
+ - block;
+ /*
+ * credits to insert 1 extent into extent tree
+ */
+ credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ while (ret >= 0 && ret < max_blocks) {
+ block = block + ret;
+ max_blocks = max_blocks - ret;
+ handle = ext4_journal_start(inode, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ break;
+ }
+ map_bh.b_state = 0;
+ ret = ext4_get_blocks(handle, inode, block,
+ max_blocks, &map_bh,
+ EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
+ if (ret <= 0) {
+ WARN_ON(ret <= 0);
+ printk(KERN_ERR "%s: ext4_ext_get_blocks "
+ "returned error inode#%lu, block=%u, "
+ "max_blocks=%u", __func__,
+ inode->i_ino, block, max_blocks);
+ }
+ ext4_mark_inode_dirty(handle, inode);
+ ret2 = ext4_journal_stop(handle);
+ if (ret <= 0 || ret2 )
+ break;
+ }
+ return ret > 0 ? ret2 : ret;
+}
+/*
* Callback function called for each extent to gather FIEMAP information.
*/
static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
@@ -3308,10 +3709,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* Walk the extent tree gathering extent information.
* ext4_ext_fiemap_cb will push extents back to user.
*/
- down_read(&EXT4_I(inode)->i_data_sem);
error = ext4_ext_walk_space(inode, start_blk, len_blks,
ext4_ext_fiemap_cb, fieinfo);
- up_read(&EXT4_I(inode)->i_data_sem);
}
return error;
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 83cf641..d6049e4 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -44,27 +44,37 @@
*
* What we do is just kick off a commit and wait on it. This will snapshot the
* inode to disk.
+ *
+ * i_mutex lock is held when entering and exiting this function
*/
int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
{
struct inode *inode = dentry->d_inode;
+ struct ext4_inode_info *ei = EXT4_I(inode);
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- int ret = 0;
+ int ret;
+ tid_t commit_tid;
J_ASSERT(ext4_journal_current_handle() == NULL);
trace_ext4_sync_file(file, dentry, datasync);
+ if (inode->i_sb->s_flags & MS_RDONLY)
+ return 0;
+
+ ret = flush_aio_dio_completed_IO(inode);
+ if (ret < 0)
+ return ret;
+
+ if (!journal)
+ return simple_fsync(file, dentry, datasync);
+
/*
- * data=writeback:
+ * data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
- * sync_inode() will sync the metadata
- *
- * data=ordered:
- * The caller's filemap_fdatawrite() will write the data and
- * sync_inode() will write the inode if it is dirty. Then the caller's
- * filemap_fdatawait() will wait on the pages.
+ * Metadata is in the journal, we wait for proper transaction to
+ * commit here.
*
* data=journal:
* filemap_fdatawrite won't do anything (the buffers are clean).
@@ -74,27 +84,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
* (they were dirtied by commit). But that's OK - the blocks are
* safe in-journal, which is all fsync() needs to ensure.
*/
- if (ext4_should_journal_data(inode)) {
- ret = ext4_force_commit(inode->i_sb);
- goto out;
- }
-
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
- goto out;
+ if (ext4_should_journal_data(inode))
+ return ext4_force_commit(inode->i_sb);
- /*
- * The VFS has written the file data. If the inode is unaltered
- * then we need not start a commit.
- */
- if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = 0, /* sys_fsync did this */
- };
- ret = sync_inode(inode, &wbc);
- if (journal && (journal->j_flags & JBD2_BARRIER))
- blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
- }
-out:
+ commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+ if (jbd2_log_start_commit(journal, commit_tid))
+ jbd2_log_wait_commit(journal, commit_tid);
+ else if (journal->j_flags & JBD2_BARRIER)
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
return ret;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f9c642b..38b2154 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
+#include <linux/workqueue.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -192,11 +193,25 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
* so before we call here everything must be consistently dirtied against
* this transaction.
*/
-static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
+int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
+ int nblocks)
{
+ int ret;
+
+ /*
+ * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this
+ * moment, get_block can be called only for blocks inside i_size since
+ * page cache has been already dropped and writes are blocked by
+ * i_mutex. So we can safely drop the i_data_sem here.
+ */
BUG_ON(EXT4_JOURNAL(inode) == NULL);
jbd_debug(2, "restarting handle %p\n", handle);
- return ext4_journal_restart(handle, blocks_for_truncate(inode));
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
+
+ return ret;
}
/*
@@ -551,15 +566,21 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
*
* Normally this function find the preferred place for block allocation,
* returns it.
+ * Because this is only used for non-extent files, we limit the block nr
+ * to 32 bits.
*/
static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
Indirect *partial)
{
+ ext4_fsblk_t goal;
+
/*
* XXX need to get goal block from mballoc's data structures
*/
- return ext4_find_near(inode, partial);
+ goal = ext4_find_near(inode, partial);
+ goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+ return goal;
}
/**
@@ -640,6 +661,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
if (*err)
goto failed_out;
+ BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
+
target -= count;
/* allocate blocks for indirect blocks */
while (index < indirect_blks && count) {
@@ -674,6 +697,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
ar.flags = EXT4_MB_HINT_DATA;
current_block = ext4_mb_new_blocks(handle, &ar, err);
+ BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
if (*err && (target == blks)) {
/*
@@ -998,10 +1022,12 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
if (!err)
err = ext4_splice_branch(handle, inode, iblock,
partial, indirect_blks, count);
- else
+ if (err)
goto cleanup;
set_buffer_new(bh_result);
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
got_it:
map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
if (count > blocks_to_boundary)
@@ -1029,7 +1055,7 @@ qsize_t ext4_get_reserved_space(struct inode *inode)
EXT4_I(inode)->i_reserved_meta_blocks;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- return total;
+ return (total << inode->i_blkbits);
}
/*
* Calculate the number of metadata blocks need to reserve
@@ -1109,22 +1135,79 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
ext4_discard_preallocations(inode);
}
-static int check_block_validity(struct inode *inode, sector_t logical,
- sector_t phys, int len)
+static int check_block_validity(struct inode *inode, const char *msg,
+ sector_t logical, sector_t phys, int len)
{
if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
- ext4_error(inode->i_sb, "check_block_validity",
+ ext4_error(inode->i_sb, msg,
"inode #%lu logical block %llu mapped to %llu "
"(size %d)", inode->i_ino,
(unsigned long long) logical,
(unsigned long long) phys, len);
- WARN_ON(1);
return -EIO;
}
return 0;
}
/*
+ * Return the number of contiguous dirty pages in a given inode
+ * starting at page frame idx.
+ */
+static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
+ unsigned int max_pages)
+{
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t index;
+ struct pagevec pvec;
+ pgoff_t num = 0;
+ int i, nr_pages, done = 0;
+
+ if (max_pages == 0)
+ return 0;
+ pagevec_init(&pvec, 0);
+ while (!done) {
+ index = idx;
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ (pgoff_t)PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ struct buffer_head *bh, *head;
+
+ lock_page(page);
+ if (unlikely(page->mapping != mapping) ||
+ !PageDirty(page) ||
+ PageWriteback(page) ||
+ page->index != idx) {
+ done = 1;
+ unlock_page(page);
+ break;
+ }
+ if (page_has_buffers(page)) {
+ bh = head = page_buffers(page);
+ do {
+ if (!buffer_delay(bh) &&
+ !buffer_unwritten(bh))
+ done = 1;
+ bh = bh->b_this_page;
+ } while (!done && (bh != head));
+ }
+ unlock_page(page);
+ if (done)
+ break;
+ idx++;
+ num++;
+ if (num >= max_pages)
+ break;
+ }
+ pagevec_release(&pvec);
+ }
+ return num;
+}
+
+/*
* The ext4_get_blocks() function tries to look up the requested blocks,
* and returns if the blocks are already mapped.
*
@@ -1155,6 +1238,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
clear_buffer_mapped(bh);
clear_buffer_unwritten(bh);
+ ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
+ "logical block %lu\n", inode->i_ino, flags, max_blocks,
+ (unsigned long)block);
/*
* Try to see if we can get the block without requesting a new
* file system block.
@@ -1170,8 +1256,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
up_read((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && buffer_mapped(bh)) {
- int ret = check_block_validity(inode, block,
- bh->b_blocknr, retval);
+ int ret = check_block_validity(inode, "file system corruption",
+ block, bh->b_blocknr, retval);
if (ret != 0)
return ret;
}
@@ -1235,8 +1321,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
* i_data's format changing. Force the migrate
* to fail by clearing migrate flags
*/
- EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
- ~EXT4_EXT_MIGRATE;
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
}
}
@@ -1252,8 +1337,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && buffer_mapped(bh)) {
- int ret = check_block_validity(inode, block,
- bh->b_blocknr, retval);
+ int ret = check_block_validity(inode, "file system "
+ "corruption after allocation",
+ block, bh->b_blocknr, retval);
if (ret != 0)
return ret;
}
@@ -1451,6 +1537,16 @@ static int do_journal_get_write_access(handle_t *handle,
return ext4_journal_get_write_access(handle, bh);
}
+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ */
+static void ext4_truncate_failed_write(struct inode *inode)
+{
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ext4_truncate(inode);
+}
+
static int ext4_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -1516,7 +1612,7 @@ retry:
ext4_journal_stop(handle);
if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might
* still be on the orphan list; we need to
@@ -1626,7 +1722,7 @@ static int ext4_ordered_write_end(struct file *file,
ret = ret2;
if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
* on the orphan list; we need to make sure the inode
@@ -1668,7 +1764,7 @@ static int ext4_writeback_write_end(struct file *file,
ret = ret2;
if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
* on the orphan list; we need to make sure the inode
@@ -1731,7 +1827,7 @@ static int ext4_journalled_write_end(struct file *file,
if (!ret)
ret = ret2;
if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
* on the orphan list; we need to make sure the inode
@@ -1776,11 +1872,11 @@ repeat:
if (ext4_claim_free_blocks(sbi, total)) {
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ vfs_dq_release_reservation_block(inode, total);
if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
yield();
goto repeat;
}
- vfs_dq_release_reservation_block(inode, total);
return -ENOSPC;
}
EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
@@ -1860,22 +1956,6 @@ static void ext4_da_page_release_reservation(struct page *page,
}
/*
- * Delayed allocation stuff
- */
-
-struct mpage_da_data {
- struct inode *inode;
- sector_t b_blocknr; /* start block number of extent */
- size_t b_size; /* size of extent */
- unsigned long b_state; /* state of the extent */
- unsigned long first_page, next_page; /* extent of pages */
- struct writeback_control *wbc;
- int io_done;
- int pages_written;
- int retval;
-};
-
-/*
* mpage_da_submit_io - walks through extent of pages and try to write
* them with writepage() call back
*
@@ -2717,7 +2797,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
* number of contiguous block. So we will limit
* number of contiguous block to a sane value
*/
- if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
(max_blocks > EXT4_MAX_TRANS_DATA))
max_blocks = EXT4_MAX_TRANS_DATA;
@@ -2735,8 +2815,11 @@ static int ext4_da_writepages(struct address_space *mapping,
int no_nrwrite_index_update;
int pages_written = 0;
long pages_skipped;
+ unsigned int max_pages;
int range_cyclic, cycled = 1, io_done = 0;
- int needed_blocks, ret = 0, nr_to_writebump = 0;
+ int needed_blocks, ret = 0;
+ long desired_nr_to_write, nr_to_writebump = 0;
+ loff_t range_start = wbc->range_start;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
trace_ext4_da_writepages(inode, wbc);
@@ -2762,16 +2845,6 @@ static int ext4_da_writepages(struct address_space *mapping,
if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
return -EROFS;
- /*
- * Make sure nr_to_write is >= sbi->s_mb_stream_request
- * This make sure small files blocks are allocated in
- * single attempt. This ensure that small files
- * get less fragmented.
- */
- if (wbc->nr_to_write < sbi->s_mb_stream_request) {
- nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
- wbc->nr_to_write = sbi->s_mb_stream_request;
- }
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
@@ -2786,6 +2859,36 @@ static int ext4_da_writepages(struct address_space *mapping,
} else
index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ /*
+ * This works around two forms of stupidity. The first is in
+ * the writeback code, which caps the maximum number of pages
+ * written to be 1024 pages. This is wrong on multiple
+ * levels; different architectues have a different page size,
+ * which changes the maximum amount of data which gets
+ * written. Secondly, 4 megabytes is way too small. XFS
+ * forces this value to be 16 megabytes by multiplying
+ * nr_to_write parameter by four, and then relies on its
+ * allocator to allocate larger extents to make them
+ * contiguous. Unfortunately this brings us to the second
+ * stupidity, which is that ext4's mballoc code only allocates
+ * at most 2048 blocks. So we force contiguous writes up to
+ * the number of dirty blocks in the inode, or
+ * sbi->max_writeback_mb_bump whichever is smaller.
+ */
+ max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
+ if (!range_cyclic && range_whole)
+ desired_nr_to_write = wbc->nr_to_write * 8;
+ else
+ desired_nr_to_write = ext4_num_dirty_pages(inode, index,
+ max_pages);
+ if (desired_nr_to_write > max_pages)
+ desired_nr_to_write = max_pages;
+
+ if (wbc->nr_to_write < desired_nr_to_write) {
+ nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
+ wbc->nr_to_write = desired_nr_to_write;
+ }
+
mpd.wbc = wbc;
mpd.inode = mapping->host;
@@ -2904,7 +3007,9 @@ retry:
out_writepages:
if (!no_nrwrite_index_update)
wbc->no_nrwrite_index_update = 0;
- wbc->nr_to_write -= nr_to_writebump;
+ if (wbc->nr_to_write > nr_to_writebump)
+ wbc->nr_to_write -= nr_to_writebump;
+ wbc->range_start = range_start;
trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
return ret;
}
@@ -2994,7 +3099,7 @@ retry:
* i_size_read because we hold i_mutex.
*/
if (pos + len > inode->i_size)
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
}
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3259,6 +3364,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
}
/*
+ * O_DIRECT for ext3 (or indirect map) based files
+ *
* If the O_DIRECT write will extend the file then add this inode to the
* orphan list. So recovery will truncate it back to the original size
* if the machine crashes during the write.
@@ -3267,7 +3374,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
* crashes then stale disk data _may_ be exposed inside the file. But current
* VFS code falls back into buffered path in that case so we are safe.
*/
-static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs)
{
@@ -3278,6 +3385,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
ssize_t ret;
int orphan = 0;
size_t count = iov_length(iov, nr_segs);
+ int retries = 0;
if (rw == WRITE) {
loff_t final_size = offset + count;
@@ -3300,9 +3408,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
}
}
+retry:
ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
offset, nr_segs,
ext4_get_block, NULL);
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
if (orphan) {
int err;
@@ -3341,6 +3452,364 @@ out:
return ret;
}
+static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ handle_t *handle = NULL;
+ int ret = 0;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int dio_credits;
+
+ ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ /*
+ * DIO VFS code passes create = 0 flag for write to
+ * the middle of file. It does this to avoid block
+ * allocation for holes, to prevent expose stale data
+ * out when there is parallel buffered read (which does
+ * not hold the i_mutex lock) while direct IO write has
+ * not completed. DIO request on holes finally falls back
+ * to buffered IO for this reason.
+ *
+ * For ext4 extent based file, since we support fallocate,
+ * new allocated extent as uninitialized, for holes, we
+ * could fallocate blocks for holes, thus parallel
+ * buffered IO read will zero out the page when read on
+ * a hole while parallel DIO write to the hole has not completed.
+ *
+ * when we come here, we know it's a direct IO write to
+ * to the middle of file (<i_size)
+ * so it's safe to override the create flag from VFS.
+ */
+ create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
+
+ if (max_blocks > DIO_MAX_BLOCKS)
+ max_blocks = DIO_MAX_BLOCKS;
+ dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ handle = ext4_journal_start(inode, dio_credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
+ create);
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
+ }
+ ext4_journal_stop(handle);
+out:
+ return ret;
+}
+
+static void ext4_free_io_end(ext4_io_end_t *io)
+{
+ BUG_ON(!io);
+ iput(io->inode);
+ kfree(io);
+}
+static void dump_aio_dio_list(struct inode * inode)
+{
+#ifdef EXT4_DEBUG
+ struct list_head *cur, *before, *after;
+ ext4_io_end_t *io, *io0, *io1;
+
+ if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
+ ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
+ return;
+ }
+
+ ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
+ list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
+ cur = &io->list;
+ before = cur->prev;
+ io0 = container_of(before, ext4_io_end_t, list);
+ after = cur->next;
+ io1 = container_of(after, ext4_io_end_t, list);
+
+ ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+ io, inode->i_ino, io0, io1);
+ }
+#endif
+}
+
+/*
+ * check a range of space and convert unwritten extents to written.
+ */
+static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
+{
+ struct inode *inode = io->inode;
+ loff_t offset = io->offset;
+ size_t size = io->size;
+ int ret = 0;
+
+ ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
+ "list->prev 0x%p\n",
+ io, inode->i_ino, io->list.next, io->list.prev);
+
+ if (list_empty(&io->list))
+ return ret;
+
+ if (io->flag != DIO_AIO_UNWRITTEN)
+ return ret;
+
+ if (offset + size <= i_size_read(inode))
+ ret = ext4_convert_unwritten_extents(inode, offset, size);
+
+ if (ret < 0) {
+ printk(KERN_EMERG "%s: failed to convert unwritten"
+ "extents to written extents, error is %d"
+ " io is still on inode %lu aio dio list\n",
+ __func__, ret, inode->i_ino);
+ return ret;
+ }
+
+ /* clear the DIO AIO unwritten flag */
+ io->flag = 0;
+ return ret;
+}
+/*
+ * work on completed aio dio IO, to convert unwritten extents to extents
+ */
+static void ext4_end_aio_dio_work(struct work_struct *work)
+{
+ ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+ struct inode *inode = io->inode;
+ int ret = 0;
+
+ mutex_lock(&inode->i_mutex);
+ ret = ext4_end_aio_dio_nolock(io);
+ if (ret >= 0) {
+ if (!list_empty(&io->list))
+ list_del_init(&io->list);
+ ext4_free_io_end(io);
+ }
+ mutex_unlock(&inode->i_mutex);
+}
+/*
+ * This function is called from ext4_sync_file().
+ *
+ * When AIO DIO IO is completed, the work to convert unwritten
+ * extents to written is queued on workqueue but may not get immediately
+ * scheduled. When fsync is called, we need to ensure the
+ * conversion is complete before fsync returns.
+ * The inode keeps track of a list of completed AIO from DIO path
+ * that might needs to do the conversion. This function walks through
+ * the list and convert the related unwritten extents to written.
+ */
+int flush_aio_dio_completed_IO(struct inode *inode)
+{
+ ext4_io_end_t *io;
+ int ret = 0;
+ int ret2 = 0;
+
+ if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
+ return ret;
+
+ dump_aio_dio_list(inode);
+ while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
+ io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
+ ext4_io_end_t, list);
+ /*
+ * Calling ext4_end_aio_dio_nolock() to convert completed
+ * IO to written.
+ *
+ * When ext4_sync_file() is called, run_queue() may already
+ * about to flush the work corresponding to this io structure.
+ * It will be upset if it founds the io structure related
+ * to the work-to-be schedule is freed.
+ *
+ * Thus we need to keep the io structure still valid here after
+ * convertion finished. The io structure has a flag to
+ * avoid double converting from both fsync and background work
+ * queue work.
+ */
+ ret = ext4_end_aio_dio_nolock(io);
+ if (ret < 0)
+ ret2 = ret;
+ else
+ list_del_init(&io->list);
+ }
+ return (ret2 < 0) ? ret2 : 0;
+}
+
+static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
+{
+ ext4_io_end_t *io = NULL;
+
+ io = kmalloc(sizeof(*io), GFP_NOFS);
+
+ if (io) {
+ igrab(inode);
+ io->inode = inode;
+ io->flag = 0;
+ io->offset = 0;
+ io->size = 0;
+ io->error = 0;
+ INIT_WORK(&io->work, ext4_end_aio_dio_work);
+ INIT_LIST_HEAD(&io->list);
+ }
+
+ return io;
+}
+
+static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+ ssize_t size, void *private)
+{
+ ext4_io_end_t *io_end = iocb->private;
+ struct workqueue_struct *wq;
+
+ /* if not async direct IO or dio with 0 bytes write, just return */
+ if (!io_end || !size)
+ return;
+
+ ext_debug("ext4_end_io_dio(): io_end 0x%p"
+ "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
+ iocb->private, io_end->inode->i_ino, iocb, offset,
+ size);
+
+ /* if not aio dio with unwritten extents, just free io and return */
+ if (io_end->flag != DIO_AIO_UNWRITTEN){
+ ext4_free_io_end(io_end);
+ iocb->private = NULL;
+ return;
+ }
+
+ io_end->offset = offset;
+ io_end->size = size;
+ wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+
+ /* queue the work to convert unwritten extents to written */
+ queue_work(wq, &io_end->work);
+
+ /* Add the io_end to per-inode completed aio dio list*/
+ list_add_tail(&io_end->list,
+ &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
+ iocb->private = NULL;
+}
+/*
+ * For ext4 extent files, ext4 will do direct-io write to holes,
+ * preallocated extents, and those write extend the file, no need to
+ * fall back to buffered IO.
+ *
+ * For holes, we fallocate those blocks, mark them as unintialized
+ * If those blocks were preallocated, we mark sure they are splited, but
+ * still keep the range to write as unintialized.
+ *
+ * The unwrritten extents will be converted to written when DIO is completed.
+ * For async direct IO, since the IO may still pending when return, we
+ * set up an end_io call back function, which will do the convertion
+ * when async direct IO completed.
+ *
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list. So recovery will truncate it back to the original size
+ * if the machine crashes during the write.
+ *
+ */
+static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ ssize_t ret;
+ size_t count = iov_length(iov, nr_segs);
+
+ loff_t final_size = offset + count;
+ if (rw == WRITE && final_size <= inode->i_size) {
+ /*
+ * We could direct write to holes and fallocate.
+ *
+ * Allocated blocks to fill the hole are marked as uninitialized
+ * to prevent paralel buffered read to expose the stale data
+ * before DIO complete the data IO.
+ *
+ * As to previously fallocated extents, ext4 get_block
+ * will just simply mark the buffer mapped but still
+ * keep the extents uninitialized.
+ *
+ * for non AIO case, we will convert those unwritten extents
+ * to written after return back from blockdev_direct_IO.
+ *
+ * for async DIO, the conversion needs to be defered when
+ * the IO is completed. The ext4 end_io callback function
+ * will be called to take care of the conversion work.
+ * Here for async case, we allocate an io_end structure to
+ * hook to the iocb.
+ */
+ iocb->private = NULL;
+ EXT4_I(inode)->cur_aio_dio = NULL;
+ if (!is_sync_kiocb(iocb)) {
+ iocb->private = ext4_init_io_end(inode);
+ if (!iocb->private)
+ return -ENOMEM;
+ /*
+ * we save the io structure for current async
+ * direct IO, so that later ext4_get_blocks()
+ * could flag the io structure whether there
+ * is a unwritten extents needs to be converted
+ * when IO is completed.
+ */
+ EXT4_I(inode)->cur_aio_dio = iocb->private;
+ }
+
+ ret = blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ ext4_get_block_dio_write,
+ ext4_end_io_dio);
+ if (iocb->private)
+ EXT4_I(inode)->cur_aio_dio = NULL;
+ /*
+ * The io_end structure takes a reference to the inode,
+ * that structure needs to be destroyed and the
+ * reference to the inode need to be dropped, when IO is
+ * complete, even with 0 byte write, or failed.
+ *
+ * In the successful AIO DIO case, the io_end structure will be
+ * desctroyed and the reference to the inode will be dropped
+ * after the end_io call back function is called.
+ *
+ * In the case there is 0 byte write, or error case, since
+ * VFS direct IO won't invoke the end_io call back function,
+ * we need to free the end_io structure here.
+ */
+ if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+ ext4_free_io_end(iocb->private);
+ iocb->private = NULL;
+ } else if (ret > 0 && (EXT4_I(inode)->i_state &
+ EXT4_STATE_DIO_UNWRITTEN)) {
+ int err;
+ /*
+ * for non AIO case, since the IO is already
+ * completed, we could do the convertion right here
+ */
+ err = ext4_convert_unwritten_extents(inode,
+ offset, ret);
+ if (err < 0)
+ ret = err;
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN;
+ }
+ return ret;
+ }
+
+ /* for write the the end of file case, we fall back to old way */
+ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
+static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+
+ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+ return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+
+ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
/*
* Pages can be marked dirty completely asynchronously from ext4's journalling
* activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
@@ -3653,13 +4122,16 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
__le32 *last)
{
__le32 *p;
+ int is_metadata = S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode);
+
if (try_to_extend_transaction(handle, inode)) {
if (bh) {
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
ext4_handle_dirty_metadata(handle, inode, bh);
}
ext4_mark_inode_dirty(handle, inode);
- ext4_journal_test_restart(handle, inode);
+ ext4_truncate_restart_trans(handle, inode,
+ blocks_for_truncate(inode));
if (bh) {
BUFFER_TRACE(bh, "retaking write access");
ext4_journal_get_write_access(handle, bh);
@@ -3682,11 +4154,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
*p = 0;
tbh = sb_find_get_block(inode->i_sb, nr);
- ext4_forget(handle, 0, inode, tbh, nr);
+ ext4_forget(handle, is_metadata, inode, tbh, nr);
}
}
- ext4_free_blocks(handle, inode, block_to_free, count, 0);
+ ext4_free_blocks(handle, inode, block_to_free, count, is_metadata);
}
/**
@@ -3870,7 +4342,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
return;
if (try_to_extend_transaction(handle, inode)) {
ext4_mark_inode_dirty(handle, inode);
- ext4_journal_test_restart(handle, inode);
+ ext4_truncate_restart_trans(handle, inode,
+ blocks_for_truncate(inode));
}
ext4_free_blocks(handle, inode, nr, 1, 1);
@@ -3958,8 +4431,7 @@ void ext4_truncate(struct inode *inode)
if (!ext4_can_truncate(inode))
return;
- if (ei->i_disksize && inode->i_size == 0 &&
- !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+ if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
@@ -4313,8 +4785,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
struct ext4_iloc iloc;
struct ext4_inode *raw_inode;
struct ext4_inode_info *ei;
- struct buffer_head *bh;
struct inode *inode;
+ journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
int block;
@@ -4325,11 +4797,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
return inode;
ei = EXT4_I(inode);
+ iloc.bh = 0;
ret = __ext4_get_inode_loc(inode, &iloc, 0);
if (ret < 0)
goto bad_inode;
- bh = iloc.bh;
raw_inode = ext4_raw_inode(&iloc);
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
@@ -4352,7 +4824,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (inode->i_mode == 0 ||
!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
/* this inode is deleted */
- brelse(bh);
ret = -ESTALE;
goto bad_inode;
}
@@ -4380,11 +4851,35 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_data[block] = raw_inode->i_block[block];
INIT_LIST_HEAD(&ei->i_orphan);
+ /*
+ * Set transaction id's of transactions that have to be committed
+ * to finish f[data]sync. We set them to currently running transaction
+ * as we cannot be sure that the inode or some of its metadata isn't
+ * part of the transaction - the inode could have been reclaimed and
+ * now it is reread from disk.
+ */
+ if (journal) {
+ transaction_t *transaction;
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction)
+ transaction = journal->j_running_transaction;
+ else
+ transaction = journal->j_committing_transaction;
+ if (transaction)
+ tid = transaction->t_tid;
+ else
+ tid = journal->j_commit_sequence;
+ spin_unlock(&journal->j_state_lock);
+ ei->i_sync_tid = tid;
+ ei->i_datasync_tid = tid;
+ }
+
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
EXT4_INODE_SIZE(inode->i_sb)) {
- brelse(bh);
ret = -EIO;
goto bad_inode;
}
@@ -4416,10 +4911,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ret = 0;
if (ei->i_file_acl &&
- ((ei->i_file_acl <
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
- EXT4_SB(sb)->s_gdb_count)) ||
- (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
+ !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
ext4_error(sb, __func__,
"bad extended attribute block %llu in inode #%lu",
ei->i_file_acl, inode->i_ino);
@@ -4437,10 +4929,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
/* Validate block references which are part of inode */
ret = ext4_check_inode_blockref(inode);
}
- if (ret) {
- brelse(bh);
+ if (ret)
goto bad_inode;
- }
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext4_file_inode_operations;
@@ -4468,7 +4958,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
init_special_inode(inode, inode->i_mode,
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
} else {
- brelse(bh);
ret = -EIO;
ext4_error(inode->i_sb, __func__,
"bogus i_mode (%o) for inode=%lu",
@@ -4481,6 +4970,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
return inode;
bad_inode:
+ brelse(iloc.bh);
iget_failed(inode);
return ERR_PTR(ret);
}
@@ -4581,8 +5071,7 @@ static int ext4_do_update_inode(handle_t *handle,
if (ext4_inode_blocks_set(handle, raw_inode, ei))
goto out_brelse;
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
- /* clear the migrate flag in the raw_inode */
- raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
+ raw_inode->i_flags = cpu_to_le32(ei->i_flags);
if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
cpu_to_le32(EXT4_OS_HURD))
raw_inode->i_file_acl_high =
@@ -4641,6 +5130,7 @@ static int ext4_do_update_inode(handle_t *handle,
err = rc;
ei->i_state &= ~EXT4_STATE_NEW;
+ ext4_update_inode_fsync_trans(handle, inode, 0);
out_brelse:
brelse(bh);
ext4_std_error(inode->i_sb, err);
@@ -4684,19 +5174,40 @@ out_brelse:
*/
int ext4_write_inode(struct inode *inode, int wait)
{
+ int err;
+
if (current->flags & PF_MEMALLOC)
return 0;
- if (ext4_journal_current_handle()) {
- jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
- dump_stack();
- return -EIO;
- }
+ if (EXT4_SB(inode->i_sb)->s_journal) {
+ if (ext4_journal_current_handle()) {
+ jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
+ dump_stack();
+ return -EIO;
+ }
- if (!wait)
- return 0;
+ if (!wait)
+ return 0;
+
+ err = ext4_force_commit(inode->i_sb);
+ } else {
+ struct ext4_iloc iloc;
- return ext4_force_commit(inode->i_sb);
+ err = ext4_get_inode_loc(inode, &iloc);
+ if (err)
+ return err;
+ if (wait)
+ sync_dirty_buffer(iloc.bh);
+ if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
+ ext4_error(inode->i_sb, __func__,
+ "IO error syncing inode, "
+ "inode=%lu, block=%llu",
+ inode->i_ino,
+ (unsigned long long)iloc.bh->b_blocknr);
+ err = -EIO;
+ }
+ }
+ return err;
}
/*
@@ -4739,8 +5250,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
/* (user+group)*(old+new) structure, inode write (sb,
* inode block, ? - but truncate inode update has it) */
- handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
- EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
+ handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
+ EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto err_out;
@@ -5137,24 +5648,13 @@ void ext4_dirty_inode(struct inode *inode)
handle_t *current_handle = ext4_journal_current_handle();
handle_t *handle;
- if (!ext4_handle_valid(current_handle)) {
- ext4_mark_inode_dirty(current_handle, inode);
- return;
- }
-
handle = ext4_journal_start(inode, 2);
if (IS_ERR(handle))
goto out;
- if (current_handle &&
- current_handle->h_transaction != handle->h_transaction) {
- /* This task has a transaction open against a different fs */
- printk(KERN_EMERG "%s: transactions do not match!\n",
- __func__);
- } else {
- jbd_debug(5, "marking dirty. outer handle=%p\n",
- current_handle);
- ext4_mark_inode_dirty(handle, inode);
- }
+
+ jbd_debug(5, "marking dirty. outer handle=%p\n", current_handle);
+ ext4_mark_inode_dirty(handle, inode);
+
ext4_journal_stop(handle);
out:
return;
@@ -5281,12 +5781,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
else
len = PAGE_CACHE_SIZE;
+ lock_page(page);
+ /*
+ * return if we have all the buffers mapped. This avoid
+ * the need to call write_begin/write_end which does a
+ * journal_start/journal_stop which can block and take
+ * long time
+ */
if (page_has_buffers(page)) {
- /* return if we have all the buffers mapped */
if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
- ext4_bh_unmapped))
+ ext4_bh_unmapped)) {
+ unlock_page(page);
goto out_unlock;
+ }
}
+ unlock_page(page);
/*
* OK, we need to fill the hole... Do write_begin write_end
* to do block allocation/reservation.We are not holding
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7050a9c..b63d193 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -221,32 +221,38 @@ setversion_out:
struct file *donor_filp;
int err;
+ if (!(filp->f_mode & FMODE_READ) ||
+ !(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+
if (copy_from_user(&me,
(struct move_extent __user *)arg, sizeof(me)))
return -EFAULT;
+ me.moved_len = 0;
donor_filp = fget(me.donor_fd);
if (!donor_filp)
return -EBADF;
- if (!capable(CAP_DAC_OVERRIDE)) {
- if ((current->real_cred->fsuid != inode->i_uid) ||
- !(inode->i_mode & S_IRUSR) ||
- !(donor_filp->f_dentry->d_inode->i_mode &
- S_IRUSR)) {
- fput(donor_filp);
- return -EACCES;
- }
+ if (!(donor_filp->f_mode & FMODE_WRITE)) {
+ err = -EBADF;
+ goto mext_out;
}
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ goto mext_out;
+
err = ext4_move_extents(filp, donor_filp, me.orig_start,
me.donor_start, me.len, &me.moved_len);
- fput(donor_filp);
+ mnt_drop_write(filp->f_path.mnt);
+ if (me.moved_len > 0)
+ file_remove_suid(donor_filp);
- if (!err)
- if (copy_to_user((struct move_extent *)arg,
- &me, sizeof(me)))
- return -EFAULT;
+ if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
+ err = -EFAULT;
+mext_out:
+ fput(donor_filp);
return err;
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cd25846..099fd47 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -908,6 +908,97 @@ out:
return err;
}
+static noinline_for_stack
+int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+{
+
+ int ret = 0;
+ void *bitmap;
+ int blocks_per_page;
+ int block, pnum, poff;
+ int num_grp_locked = 0;
+ struct ext4_group_info *this_grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ struct page *page = NULL, *bitmap_page = NULL;
+
+ mb_debug("init group %lu\n", group);
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ this_grp = ext4_get_group_info(sb, group);
+ /*
+ * This ensures we don't add group
+ * to this buddy cache via resize
+ */
+ num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
+ if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+ /*
+ * somebody initialized the group
+ * return without doing anything
+ */
+ ret = 0;
+ goto err;
+ }
+ /*
+ * the buddy cache inode stores the block bitmap
+ * and buddy information in consecutive blocks.
+ * So for each group we need two blocks.
+ */
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page) {
+ BUG_ON(page->mapping != inode->i_mapping);
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ unlock_page(page);
+ }
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+ mark_page_accessed(page);
+ bitmap_page = page;
+ bitmap = page_address(page) + (poff * sb->s_blocksize);
+
+ /* init buddy cache */
+ block++;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page == bitmap_page) {
+ /*
+ * If both the bitmap and buddy are in
+ * the same page we don't need to force
+ * init the buddy
+ */
+ unlock_page(page);
+ } else if (page) {
+ BUG_ON(page->mapping != inode->i_mapping);
+ ret = ext4_mb_init_cache(page, bitmap);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ unlock_page(page);
+ }
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+ mark_page_accessed(page);
+err:
+ ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
+ if (bitmap_page)
+ page_cache_release(bitmap_page);
+ if (page)
+ page_cache_release(page);
+ return ret;
+}
+
static noinline_for_stack int
ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_buddy *e4b)
@@ -941,8 +1032,26 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
* groups mapped by the page is blocked
* till we are done with allocation
*/
+repeat_load_buddy:
down_read(e4b->alloc_semp);
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+ /* we need to check for group need init flag
+ * with alloc_semp held so that we can be sure
+ * that new blocks didn't get added to the group
+ * when we are loading the buddy cache
+ */
+ up_read(e4b->alloc_semp);
+ /*
+ * we need full data about the group
+ * to make a good selection
+ */
+ ret = ext4_mb_init_group(sb, group);
+ if (ret)
+ return ret;
+ goto repeat_load_buddy;
+ }
+
/*
* the buddy cache inode stores the block bitmap
* and buddy information in consecutive blocks.
@@ -1360,7 +1469,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
ac->alloc_semp = e4b->alloc_semp;
e4b->alloc_semp = NULL;
/* store last allocated for subsequent stream allocation */
- if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
+ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
spin_lock(&sbi->s_md_lock);
sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
@@ -1837,97 +1946,6 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
}
-static noinline_for_stack
-int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
-{
-
- int ret;
- void *bitmap;
- int blocks_per_page;
- int block, pnum, poff;
- int num_grp_locked = 0;
- struct ext4_group_info *this_grp;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;
- struct page *page = NULL, *bitmap_page = NULL;
-
- mb_debug("init group %lu\n", group);
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- this_grp = ext4_get_group_info(sb, group);
- /*
- * This ensures we don't add group
- * to this buddy cache via resize
- */
- num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
- if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
- /*
- * somebody initialized the group
- * return without doing anything
- */
- ret = 0;
- goto err;
- }
- /*
- * the buddy cache inode stores the block bitmap
- * and buddy information in consecutive blocks.
- * So for each group we need two blocks.
- */
- block = group * 2;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, NULL);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
- }
- if (page == NULL || !PageUptodate(page)) {
- ret = -EIO;
- goto err;
- }
- mark_page_accessed(page);
- bitmap_page = page;
- bitmap = page_address(page) + (poff * sb->s_blocksize);
-
- /* init buddy cache */
- block++;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page == bitmap_page) {
- /*
- * If both the bitmap and buddy are in
- * the same page we don't need to force
- * init the buddy
- */
- unlock_page(page);
- } else if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, bitmap);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
- }
- if (page == NULL || !PageUptodate(page)) {
- ret = -EIO;
- goto err;
- }
- mark_page_accessed(page);
-err:
- ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
- if (bitmap_page)
- page_cache_release(bitmap_page);
- if (page)
- page_cache_release(page);
- return ret;
-}
-
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
@@ -1938,11 +1956,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
struct ext4_sb_info *sbi;
struct super_block *sb;
struct ext4_buddy e4b;
- loff_t size, isize;
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
ngroups = ext4_get_groups_count(sb);
+ /* non-extent files are limited to low blocks/groups */
+ if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
+ ngroups = sbi->s_blockfile_groups;
+
BUG_ON(ac->ac_status == AC_STATUS_FOUND);
/* first, try the goal */
@@ -1974,20 +1995,16 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
}
bsbits = ac->ac_sb->s_blocksize_bits;
- /* if stream allocation is enabled, use global goal */
- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
- isize = i_size_read(ac->ac_inode) >> bsbits;
- if (size < isize)
- size = isize;
- if (size < sbi->s_mb_stream_request &&
- (ac->ac_flags & EXT4_MB_HINT_DATA)) {
+ /* if stream allocation is enabled, use global goal */
+ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
/* TBD: may be hot point */
spin_lock(&sbi->s_md_lock);
ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
spin_unlock(&sbi->s_md_lock);
}
+
/* Let's just scan groups to find more-less suitable blocks */
cr = ac->ac_2order ? 0 : 1;
/*
@@ -2015,27 +2032,6 @@ repeat:
if (grp->bb_free == 0)
continue;
- /*
- * if the group is already init we check whether it is
- * a good group and if not we don't load the buddy
- */
- if (EXT4_MB_GRP_NEED_INIT(grp)) {
- /*
- * we need full data about the group
- * to make a good selection
- */
- err = ext4_mb_init_group(sb, group);
- if (err)
- goto out;
- }
-
- /*
- * If the particular group doesn't satisfy our
- * criteria we continue with the next group
- */
- if (!ext4_mb_good_group(ac, group, cr))
- continue;
-
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err)
goto out;
@@ -2571,13 +2567,11 @@ static int ext4_mb_init_backend(struct super_block *sb)
{
ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
- int metalen;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
int num_meta_group_infos;
int num_meta_group_infos_max;
int array_size;
- struct ext4_group_info **meta_group_info;
struct ext4_group_desc *desc;
/* This is the number of blocks used by GDT */
@@ -2622,22 +2616,6 @@ static int ext4_mb_init_backend(struct super_block *sb)
goto err_freesgi;
}
EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
-
- metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
- for (i = 0; i < num_meta_group_infos; i++) {
- if ((i + 1) == num_meta_group_infos)
- metalen = sizeof(*meta_group_info) *
- (ngroups -
- (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
- meta_group_info = kmalloc(metalen, GFP_KERNEL);
- if (meta_group_info == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
- "buddy group\n");
- goto err_freemeta;
- }
- sbi->s_group_info[i] = meta_group_info;
- }
-
for (i = 0; i < ngroups; i++) {
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
@@ -2655,7 +2633,6 @@ err_freebuddy:
while (i-- > 0)
kfree(ext4_get_group_info(sb, i));
i = num_meta_group_infos;
-err_freemeta:
while (i-- > 0)
kfree(sbi->s_group_info[i]);
iput(sbi->s_buddy_cache);
@@ -2833,7 +2810,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
struct ext4_group_info *db;
int err, count = 0, count2 = 0;
struct ext4_free_data *entry;
- ext4_fsblk_t discard_block;
struct list_head *l, *ltmp;
list_for_each_safe(l, ltmp, &txn->t_private_list) {
@@ -2863,13 +2839,19 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
page_cache_release(e4b.bd_bitmap_page);
}
ext4_unlock_group(sb, entry->group);
- discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
- + entry->start_blk
- + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
- trace_ext4_discard_blocks(sb, (unsigned long long)discard_block,
- entry->count);
- sb_issue_discard(sb, discard_block, entry->count);
-
+ if (test_opt(sb, DISCARD)) {
+ ext4_fsblk_t discard_block;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ discard_block = (ext4_fsblk_t)entry->group *
+ EXT4_BLOCKS_PER_GROUP(sb)
+ + entry->start_blk
+ + le32_to_cpu(es->s_first_data_block);
+ trace_ext4_discard_blocks(sb,
+ (unsigned long long)discard_block,
+ entry->count);
+ sb_issue_discard(sb, discard_block, entry->count);
+ }
kmem_cache_free(ext4_free_ext_cachep, entry);
ext4_mb_release_desc(&e4b);
}
@@ -3276,6 +3258,24 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
}
/*
+ * Called on failure; free up any blocks from the inode PA for this
+ * context. We don't need this for MB_GROUP_PA because we only change
+ * pa_free in ext4_mb_release_context(), but on failure, we've already
+ * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
+ */
+static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
+{
+ struct ext4_prealloc_space *pa = ac->ac_pa;
+ int len;
+
+ if (pa && pa->pa_type == MB_INODE_PA) {
+ len = ac->ac_b_ex.fe_len;
+ pa->pa_free += len;
+ }
+
+}
+
+/*
* use blocks preallocated to inode
*/
static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
@@ -3382,6 +3382,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
continue;
+ /* non-extent files can't have physical blocks past 2^32 */
+ if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
+ pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
+ continue;
+
/* found preallocated blocks, use them */
spin_lock(&pa->pa_lock);
if (pa->pa_deleted == 0 && pa->pa_free) {
@@ -4174,16 +4179,26 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
return;
+ if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+ return;
+
size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
- isize = i_size_read(ac->ac_inode) >> bsbits;
- size = max(size, isize);
+ isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
+ >> bsbits;
- /* don't use group allocation for large files */
- if (size >= sbi->s_mb_stream_request)
+ if ((size == isize) &&
+ !ext4_fs_is_busy(sbi) &&
+ (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
+ ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
return;
+ }
- if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+ /* don't use group allocation for large files */
+ size = max(size, isize);
+ if (size >= sbi->s_mb_stream_request) {
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
return;
+ }
BUG_ON(ac->ac_lg != NULL);
/*
@@ -4549,6 +4564,7 @@ repeat:
ac->ac_status = AC_STATUS_CONTINUE;
goto repeat;
} else if (*errp) {
+ ext4_discard_allocated_blocks(ac);
ac->ac_b_ex.fe_len = 0;
ar->len = 0;
ext4_mb_show_ac(ac);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 313a50b..8646149 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
goto err_out;
}
}
- retval = ext4_ext_insert_extent(handle, inode, path, &newext);
+ retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
err_out:
if (path) {
ext4_ext_drop_refs(path);
@@ -238,7 +238,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
* So allocate a credit of 3. We may update
* quota (user and group).
*/
- needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+ needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
if (ext4_journal_extend(handle, needed) != 0)
retval = ext4_journal_restart(handle, needed);
@@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
down_write(&EXT4_I(inode)->i_data_sem);
/*
- * if EXT4_EXT_MIGRATE is cleared a block allocation
+ * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation
* happened after we started the migrate. We need to
* fail the migrate
*/
- if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) {
+ if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
retval = -EAGAIN;
up_write(&EXT4_I(inode)->i_data_sem);
goto err_out;
} else
- EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
- ~EXT4_EXT_MIGRATE;
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
/*
* We have the extent map build with the tmp inode.
* Now copy the i_data across
@@ -478,7 +477,7 @@ int ext4_ext_migrate(struct inode *inode)
handle = ext4_journal_start(inode,
EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)
+ EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
+ 1);
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
@@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode)
* when we add extents we extent the journal
*/
/*
- * Even though we take i_mutex we can still cause block allocation
- * via mmap write to holes. If we have allocated new blocks we fail
- * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag.
- * The flag is updated with i_data_sem held to prevent racing with
- * block allocation.
+ * Even though we take i_mutex we can still cause block
+ * allocation via mmap write to holes. If we have allocated
+ * new blocks we fail migrate. New block allocation will
+ * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
+ * with i_data_sem held to prevent racing with block
+ * allocation.
*/
down_read((&EXT4_I(inode)->i_data_sem));
- EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE;
+ EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
up_read((&EXT4_I(inode)->i_data_sem));
handle = ext4_journal_start(inode, 1);
@@ -618,7 +618,7 @@ err_out:
tmp_inode->i_nlink = 0;
ext4_journal_stop(handle);
-
+ unlock_new_inode(tmp_inode);
iput(tmp_inode);
return retval;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index bbf2dd9..9a573a6 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -19,14 +19,31 @@
#include "ext4_extents.h"
#include "ext4.h"
-#define get_ext_path(path, inode, block, ret) \
- do { \
- path = ext4_ext_find_extent(inode, block, path); \
- if (IS_ERR(path)) { \
- ret = PTR_ERR(path); \
- path = NULL; \
- } \
- } while (0)
+/**
+ * get_ext_path - Find an extent path for designated logical block number.
+ *
+ * @inode: an inode which is searched
+ * @lblock: logical block number to find an extent path
+ * @path: pointer to an extent path pointer (for output)
+ *
+ * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value
+ * on failure.
+ */
+static inline int
+get_ext_path(struct inode *inode, ext4_lblk_t lblock,
+ struct ext4_ext_path **path)
+{
+ int ret = 0;
+
+ *path = ext4_ext_find_extent(inode, lblock, *path);
+ if (IS_ERR(*path)) {
+ ret = PTR_ERR(*path);
+ *path = NULL;
+ } else if ((*path)[ext_depth(inode)].p_ext == NULL)
+ ret = -ENODATA;
+
+ return ret;
+}
/**
* copy_extent_status - Copy the extent's initialization status
@@ -60,12 +77,14 @@ static int
mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
struct ext4_extent **extent)
{
+ struct ext4_extent_header *eh;
int ppos, leaf_ppos = path->p_depth;
ppos = leaf_ppos;
if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
/* leaf block */
*extent = ++path[ppos].p_ext;
+ path[ppos].p_block = ext_pblock(path[ppos].p_ext);
return 0;
}
@@ -102,9 +121,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
ext_block_hdr(path[cur_ppos+1].p_bh);
}
+ path[leaf_ppos].p_ext = *extent = NULL;
+
+ eh = path[leaf_ppos].p_hdr;
+ if (le16_to_cpu(eh->eh_entries) == 0)
+ /* empty leaf is found */
+ return -ENODATA;
+
/* leaf block */
path[leaf_ppos].p_ext = *extent =
EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
+ path[leaf_ppos].p_block =
+ ext_pblock(path[leaf_ppos].p_ext);
return 0;
}
}
@@ -113,47 +141,43 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
}
/**
- * mext_double_down_read - Acquire two inodes' read semaphore
+ * mext_check_null_inode - NULL check for two inodes
*
- * @orig_inode: original inode structure
- * @donor_inode: donor inode structure
- * Acquire read semaphore of the two inodes (orig and donor) by i_ino order.
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
*/
-static void
-mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
+static int
+mext_check_null_inode(struct inode *inode1, struct inode *inode2,
+ const char *function)
{
- struct inode *first = orig_inode, *second = donor_inode;
-
- BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
- /*
- * Use the inode number to provide the stable locking order instead
- * of its address, because the C language doesn't guarantee you can
- * compare pointers that don't come from the same array.
- */
- if (donor_inode->i_ino < orig_inode->i_ino) {
- first = donor_inode;
- second = orig_inode;
+ int ret = 0;
+
+ if (inode1 == NULL) {
+ ext4_error(inode2->i_sb, function,
+ "Both inodes should not be NULL: "
+ "inode1 NULL inode2 %lu", inode2->i_ino);
+ ret = -EIO;
+ } else if (inode2 == NULL) {
+ ext4_error(inode1->i_sb, function,
+ "Both inodes should not be NULL: "
+ "inode1 %lu inode2 NULL", inode1->i_ino);
+ ret = -EIO;
}
-
- down_read(&EXT4_I(first)->i_data_sem);
- down_read(&EXT4_I(second)->i_data_sem);
+ return ret;
}
/**
- * mext_double_down_write - Acquire two inodes' write semaphore
+ * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
*
* @orig_inode: original inode structure
* @donor_inode: donor inode structure
- * Acquire write semaphore of the two inodes (orig and donor) by i_ino order.
+ * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
+ * i_ino order.
*/
static void
-mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
+double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
{
struct inode *first = orig_inode, *second = donor_inode;
- BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
/*
* Use the inode number to provide the stable locking order instead
* of its address, because the C language doesn't guarantee you can
@@ -165,37 +189,19 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
}
down_write(&EXT4_I(first)->i_data_sem);
- down_write(&EXT4_I(second)->i_data_sem);
-}
-
-/**
- * mext_double_up_read - Release two inodes' read semaphore
- *
- * @orig_inode: original inode structure to be released its lock first
- * @donor_inode: donor inode structure to be released its lock second
- * Release read semaphore of two inodes (orig and donor).
- */
-static void
-mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
-{
- BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
- up_read(&EXT4_I(orig_inode)->i_data_sem);
- up_read(&EXT4_I(donor_inode)->i_data_sem);
+ down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
}
/**
- * mext_double_up_write - Release two inodes' write semaphore
+ * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
*
* @orig_inode: original inode structure to be released its lock first
* @donor_inode: donor inode structure to be released its lock second
- * Release write semaphore of two inodes (orig and donor).
+ * Release write lock of i_data_sem of two inodes (orig and donor).
*/
static void
-mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
+double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
{
- BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
up_write(&EXT4_I(orig_inode)->i_data_sem);
up_write(&EXT4_I(donor_inode)->i_data_sem);
}
@@ -283,23 +289,23 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
}
if (new_flag) {
- get_ext_path(orig_path, orig_inode, eblock, err);
- if (orig_path == NULL)
+ err = get_ext_path(orig_inode, eblock, &orig_path);
+ if (err)
goto out;
if (ext4_ext_insert_extent(handle, orig_inode,
- orig_path, new_ext))
+ orig_path, new_ext, 0))
goto out;
}
if (end_flag) {
- get_ext_path(orig_path, orig_inode,
- le32_to_cpu(end_ext->ee_block) - 1, err);
- if (orig_path == NULL)
+ err = get_ext_path(orig_inode,
+ le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
+ if (err)
goto out;
if (ext4_ext_insert_extent(handle, orig_inode,
- orig_path, end_ext))
+ orig_path, end_ext, 0))
goto out;
}
out:
@@ -519,7 +525,15 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
* oext |-----------|
* new_ext |-------|
*/
- BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end);
+ if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
+ ext4_error(orig_inode->i_sb, __func__,
+ "new_ext_end(%u) should be less than or equal to "
+ "oext->ee_block(%u) + oext_alen(%d) - 1",
+ new_ext_end, le32_to_cpu(oext->ee_block),
+ oext_alen);
+ ret = -EIO;
+ goto out;
+ }
/*
* Case: new_ext is smaller than original extent
@@ -543,6 +557,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
o_end, &start_ext, &new_ext, &end_ext);
+out:
return ret;
}
@@ -554,8 +569,10 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
* @orig_off: block offset of original inode
* @donor_off: block offset of donor inode
* @max_count: the maximun length of extents
+ *
+ * Return 0 on success, or a negative error value on failure.
*/
-static void
+static int
mext_calc_swap_extents(struct ext4_extent *tmp_dext,
struct ext4_extent *tmp_oext,
ext4_lblk_t orig_off, ext4_lblk_t donor_off,
@@ -564,6 +581,19 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
ext4_lblk_t diff, orig_diff;
struct ext4_extent dext_old, oext_old;
+ BUG_ON(orig_off != donor_off);
+
+ /* original and donor extents have to cover the same block offset */
+ if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
+ le32_to_cpu(tmp_oext->ee_block) +
+ ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
+ return -ENODATA;
+
+ if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
+ le32_to_cpu(tmp_dext->ee_block) +
+ ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
+ return -ENODATA;
+
dext_old = *tmp_dext;
oext_old = *tmp_oext;
@@ -591,6 +621,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
copy_extent_status(&oext_old, tmp_dext);
copy_extent_status(&dext_old, tmp_oext);
+
+ return 0;
}
/**
@@ -601,6 +633,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
* @donor_inode: donor inode
* @from: block offset of orig_inode
* @count: block count to be replaced
+ * @err: pointer to save return value
*
* Replace original inode extents and donor inode extents page by page.
* We implement this replacement in the following three steps:
@@ -611,33 +644,33 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
* 3. Change the block information of donor inode to point at the saved
* original inode blocks in the dummy extents.
*
- * Return 0 on success, or a negative error value on failure.
+ * Return replaced block count.
*/
static int
mext_replace_branches(handle_t *handle, struct inode *orig_inode,
struct inode *donor_inode, ext4_lblk_t from,
- ext4_lblk_t count)
+ ext4_lblk_t count, int *err)
{
struct ext4_ext_path *orig_path = NULL;
struct ext4_ext_path *donor_path = NULL;
struct ext4_extent *oext, *dext;
struct ext4_extent tmp_dext, tmp_oext;
ext4_lblk_t orig_off = from, donor_off = from;
- int err = 0;
int depth;
int replaced_count = 0;
int dext_alen;
- mext_double_down_write(orig_inode, donor_inode);
+ /* Protect extent trees against block allocations via delalloc */
+ double_down_write_data_sem(orig_inode, donor_inode);
/* Get the original extent for the block "orig_off" */
- get_ext_path(orig_path, orig_inode, orig_off, err);
- if (orig_path == NULL)
+ *err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (*err)
goto out;
/* Get the donor extent for the head */
- get_ext_path(donor_path, donor_inode, donor_off, err);
- if (donor_path == NULL)
+ *err = get_ext_path(donor_inode, donor_off, &donor_path);
+ if (*err)
goto out;
depth = ext_depth(orig_inode);
oext = orig_path[depth].p_ext;
@@ -647,24 +680,39 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
dext = donor_path[depth].p_ext;
tmp_dext = *dext;
- mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
donor_off, count);
+ if (*err)
+ goto out;
/* Loop for the donor extents */
while (1) {
/* The extent for donor must be found. */
- BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block));
+ if (!dext) {
+ ext4_error(donor_inode->i_sb, __func__,
+ "The extent for donor must be found");
+ *err = -EIO;
+ goto out;
+ } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
+ ext4_error(donor_inode->i_sb, __func__,
+ "Donor offset(%u) and the first block of donor "
+ "extent(%u) should be equal",
+ donor_off,
+ le32_to_cpu(tmp_dext.ee_block));
+ *err = -EIO;
+ goto out;
+ }
/* Set donor extent to orig extent */
- err = mext_leaf_block(handle, orig_inode,
+ *err = mext_leaf_block(handle, orig_inode,
orig_path, &tmp_dext, &orig_off);
- if (err < 0)
+ if (*err)
goto out;
/* Set orig extent to donor extent */
- err = mext_leaf_block(handle, donor_inode,
+ *err = mext_leaf_block(handle, donor_inode,
donor_path, &tmp_oext, &donor_off);
- if (err < 0)
+ if (*err)
goto out;
dext_alen = ext4_ext_get_actual_len(&tmp_dext);
@@ -678,36 +726,26 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
if (orig_path)
ext4_ext_drop_refs(orig_path);
- get_ext_path(orig_path, orig_inode, orig_off, err);
- if (orig_path == NULL)
+ *err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (*err)
goto out;
depth = ext_depth(orig_inode);
oext = orig_path[depth].p_ext;
- if (le32_to_cpu(oext->ee_block) +
- ext4_ext_get_actual_len(oext) <= orig_off) {
- err = 0;
- goto out;
- }
tmp_oext = *oext;
if (donor_path)
ext4_ext_drop_refs(donor_path);
- get_ext_path(donor_path, donor_inode,
- donor_off, err);
- if (donor_path == NULL)
+ *err = get_ext_path(donor_inode, donor_off, &donor_path);
+ if (*err)
goto out;
depth = ext_depth(donor_inode);
dext = donor_path[depth].p_ext;
- if (le32_to_cpu(dext->ee_block) +
- ext4_ext_get_actual_len(dext) <= donor_off) {
- err = 0;
- goto out;
- }
tmp_dext = *dext;
- mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
- donor_off,
- count - replaced_count);
+ *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ donor_off, count - replaced_count);
+ if (*err)
+ goto out;
}
out:
@@ -720,8 +758,12 @@ out:
kfree(donor_path);
}
- mext_double_up_write(orig_inode, donor_inode);
- return err;
+ ext4_ext_invalidate_cache(orig_inode);
+ ext4_ext_invalidate_cache(donor_inode);
+
+ double_up_write_data_sem(orig_inode, donor_inode);
+
+ return replaced_count;
}
/**
@@ -733,16 +775,17 @@ out:
* @data_offset_in_page: block index where data swapping starts
* @block_len_in_page: the number of blocks to be swapped
* @uninit: orig extent is uninitialized or not
+ * @err: pointer to save return value
*
* Save the data in original inode blocks and replace original inode extents
* with donor inode extents by calling mext_replace_branches().
- * Finally, write out the saved data in new original inode blocks. Return 0
- * on success, or a negative error value on failure.
+ * Finally, write out the saved data in new original inode blocks. Return
+ * replaced block count.
*/
static int
-move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
+move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
pgoff_t orig_page_offset, int data_offset_in_page,
- int block_len_in_page, int uninit)
+ int block_len_in_page, int uninit, int *err)
{
struct inode *orig_inode = o_filp->f_dentry->d_inode;
struct address_space *mapping = orig_inode->i_mapping;
@@ -754,9 +797,11 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
unsigned long blocksize = orig_inode->i_sb->s_blocksize;
unsigned int w_flags = 0;
- unsigned int tmp_data_len, data_len;
+ unsigned int tmp_data_size, data_size, replaced_size;
void *fsdata;
- int ret, i, jblocks;
+ int i, jblocks;
+ int err2 = 0;
+ int replaced_count = 0;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
/*
@@ -766,8 +811,8 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
handle = ext4_journal_start(orig_inode, jblocks);
if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- return ret;
+ *err = PTR_ERR(handle);
+ return 0;
}
if (segment_eq(get_fs(), KERNEL_DS))
@@ -783,39 +828,36 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
* Just swap data blocks between orig and donor.
*/
if (uninit) {
- ret = mext_replace_branches(handle, orig_inode,
- donor_inode, orig_blk_offset,
- block_len_in_page);
-
- /* Clear the inode cache not to refer to the old data */
- ext4_ext_invalidate_cache(orig_inode);
- ext4_ext_invalidate_cache(donor_inode);
+ replaced_count = mext_replace_branches(handle, orig_inode,
+ donor_inode, orig_blk_offset,
+ block_len_in_page, err);
goto out2;
}
offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
- /* Calculate data_len */
+ /* Calculate data_size */
if ((orig_blk_offset + block_len_in_page - 1) ==
((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
/* Replace the last block */
- tmp_data_len = orig_inode->i_size & (blocksize - 1);
+ tmp_data_size = orig_inode->i_size & (blocksize - 1);
/*
- * If data_len equal zero, it shows data_len is multiples of
+ * If data_size equal zero, it shows data_size is multiples of
* blocksize. So we set appropriate value.
*/
- if (tmp_data_len == 0)
- tmp_data_len = blocksize;
+ if (tmp_data_size == 0)
+ tmp_data_size = blocksize;
- data_len = tmp_data_len +
+ data_size = tmp_data_size +
((block_len_in_page - 1) << orig_inode->i_blkbits);
- } else {
- data_len = block_len_in_page << orig_inode->i_blkbits;
- }
+ } else
+ data_size = block_len_in_page << orig_inode->i_blkbits;
+
+ replaced_size = data_size;
- ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags,
+ *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
&page, &fsdata);
- if (unlikely(ret < 0))
+ if (unlikely(*err < 0))
goto out;
if (!PageUptodate(page)) {
@@ -836,14 +878,17 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
/* Release old bh and drop refs */
try_to_release_page(page, 0);
- ret = mext_replace_branches(handle, orig_inode, donor_inode,
- orig_blk_offset, block_len_in_page);
- if (ret < 0)
- goto out;
-
- /* Clear the inode cache not to refer to the old data */
- ext4_ext_invalidate_cache(orig_inode);
- ext4_ext_invalidate_cache(donor_inode);
+ replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
+ orig_blk_offset, block_len_in_page,
+ &err2);
+ if (err2) {
+ if (replaced_count) {
+ block_len_in_page = replaced_count;
+ replaced_size =
+ block_len_in_page << orig_inode->i_blkbits;
+ } else
+ goto out;
+ }
if (!page_has_buffers(page))
create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
@@ -853,16 +898,16 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
bh = bh->b_this_page;
for (i = 0; i < block_len_in_page; i++) {
- ret = ext4_get_block(orig_inode,
+ *err = ext4_get_block(orig_inode,
(sector_t)(orig_blk_offset + i), bh, 0);
- if (ret < 0)
+ if (*err < 0)
goto out;
if (bh->b_this_page != NULL)
bh = bh->b_this_page;
}
- ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len,
+ *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
page, fsdata);
page = NULL;
@@ -871,11 +916,15 @@ out:
if (PageLocked(page))
unlock_page(page);
page_cache_release(page);
+ ext4_journal_stop(handle);
}
out2:
ext4_journal_stop(handle);
- return ret < 0 ? ret : 0;
+ if (err2)
+ *err = err2;
+
+ return replaced_count;
}
/**
@@ -886,7 +935,6 @@ out2:
* @orig_start: logical start offset in block for orig
* @donor_start: logical start offset in block for donor
* @len: the number of blocks to be moved
- * @moved_len: moved block length
*
* Check the arguments of ext4_move_extents() whether the files can be
* exchanged with each other.
@@ -894,9 +942,13 @@ out2:
*/
static int
mext_check_arguments(struct inode *orig_inode,
- struct inode *donor_inode, __u64 orig_start,
- __u64 donor_start, __u64 *len, __u64 moved_len)
+ struct inode *donor_inode, __u64 orig_start,
+ __u64 donor_start, __u64 *len)
{
+ ext4_lblk_t orig_blocks, donor_blocks;
+ unsigned int blkbits = orig_inode->i_blkbits;
+ unsigned int blocksize = 1 << blkbits;
+
/* Regular file check */
if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
ext4_debug("ext4 move extent: The argument files should be "
@@ -905,6 +957,13 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
+ if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
+ ext4_debug("ext4 move extent: suid or sgid is set"
+ " to donor file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
/* Ext4 move extent does not support swapfile */
if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
ext4_debug("ext4 move extent: The argument files should "
@@ -921,14 +980,6 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
- /* orig and donor should be different file */
- if (orig_inode->i_ino == donor_inode->i_ino) {
- ext4_debug("ext4 move extent: The argument files should not "
- "be same file [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
- return -EINVAL;
- }
-
/* Ext4 move extent supports only extent based file */
if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
ext4_debug("ext4 move extent: orig file is not extents "
@@ -953,13 +1004,6 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
- if (moved_len) {
- ext4_debug("ext4 move extent: moved_len should be 0 "
- "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
- donor_inode->i_ino);
- return -EINVAL;
- }
-
if ((orig_start > MAX_DEFRAG_SIZE) ||
(donor_start > MAX_DEFRAG_SIZE) ||
(*len > MAX_DEFRAG_SIZE) ||
@@ -971,43 +1015,47 @@ mext_check_arguments(struct inode *orig_inode,
}
if (orig_inode->i_size > donor_inode->i_size) {
- if (orig_start >= donor_inode->i_size) {
+ donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
+ /* TODO: eliminate this artificial restriction */
+ if (orig_start >= donor_blocks) {
ext4_debug("ext4 move extent: orig start offset "
- "[%llu] should be less than donor file size "
- "[%lld] [ino:orig %lu, donor_inode %lu]\n",
- orig_start, donor_inode->i_size,
+ "[%llu] should be less than donor file blocks "
+ "[%u] [ino:orig %lu, donor %lu]\n",
+ orig_start, donor_blocks,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
- if (orig_start + *len > donor_inode->i_size) {
+ /* TODO: eliminate this artificial restriction */
+ if (orig_start + *len > donor_blocks) {
ext4_debug("ext4 move extent: End offset [%llu] should "
- "be less than donor file size [%lld]."
- "So adjust length from %llu to %lld "
+ "be less than donor file blocks [%u]."
+ "So adjust length from %llu to %llu "
"[ino:orig %lu, donor %lu]\n",
- orig_start + *len, donor_inode->i_size,
- *len, donor_inode->i_size - orig_start,
+ orig_start + *len, donor_blocks,
+ *len, donor_blocks - orig_start,
orig_inode->i_ino, donor_inode->i_ino);
- *len = donor_inode->i_size - orig_start;
+ *len = donor_blocks - orig_start;
}
} else {
- if (orig_start >= orig_inode->i_size) {
+ orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
+ if (orig_start >= orig_blocks) {
ext4_debug("ext4 move extent: start offset [%llu] "
- "should be less than original file size "
- "[%lld] [inode:orig %lu, donor %lu]\n",
- orig_start, orig_inode->i_size,
+ "should be less than original file blocks "
+ "[%u] [ino:orig %lu, donor %lu]\n",
+ orig_start, orig_blocks,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
- if (orig_start + *len > orig_inode->i_size) {
+ if (orig_start + *len > orig_blocks) {
ext4_debug("ext4 move extent: Adjust length "
- "from %llu to %lld. Because it should be "
- "less than original file size "
+ "from %llu to %llu. Because it should be "
+ "less than original file blocks "
"[ino:orig %lu, donor %lu]\n",
- *len, orig_inode->i_size - orig_start,
+ *len, orig_blocks - orig_start,
orig_inode->i_ino, donor_inode->i_ino);
- *len = orig_inode->i_size - orig_start;
+ *len = orig_blocks - orig_start;
}
}
@@ -1027,18 +1075,23 @@ mext_check_arguments(struct inode *orig_inode,
* @inode1: the inode structure
* @inode2: the inode structure
*
- * Lock two inodes' i_mutex by i_ino order. This function is moved from
- * fs/inode.c.
+ * Lock two inodes' i_mutex by i_ino order.
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
*/
-static void
+static int
mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
{
- if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
- if (inode1)
- mutex_lock(&inode1->i_mutex);
- else if (inode2)
- mutex_lock(&inode2->i_mutex);
- return;
+ int ret = 0;
+
+ BUG_ON(inode1 == NULL && inode2 == NULL);
+
+ ret = mext_check_null_inode(inode1, inode2, __func__);
+ if (ret < 0)
+ goto out;
+
+ if (inode1 == inode2) {
+ mutex_lock(&inode1->i_mutex);
+ goto out;
}
if (inode1->i_ino < inode2->i_ino) {
@@ -1048,6 +1101,9 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
}
+
+out:
+ return ret;
}
/**
@@ -1056,17 +1112,28 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
* @inode1: the inode that is released first
* @inode2: the inode that is released second
*
- * This function is moved from fs/inode.c.
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
*/
-static void
+static int
mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
{
+ int ret = 0;
+
+ BUG_ON(inode1 == NULL && inode2 == NULL);
+
+ ret = mext_check_null_inode(inode1, inode2, __func__);
+ if (ret < 0)
+ goto out;
+
if (inode1)
mutex_unlock(&inode1->i_mutex);
if (inode2 && inode2 != inode1)
mutex_unlock(&inode2->i_mutex);
+
+out:
+ return ret;
}
/**
@@ -1123,70 +1190,84 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
ext4_lblk_t rest_blocks;
pgoff_t orig_page_offset = 0, seq_end_page;
- int ret, depth, last_extent = 0;
+ int ret1, ret2, depth, last_extent = 0;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
int data_offset_in_page;
int block_len_in_page;
int uninit;
- /* protect orig and donor against a truncate */
- mext_inode_double_lock(orig_inode, donor_inode);
+ /* orig and donor should be different file */
+ if (orig_inode->i_ino == donor_inode->i_ino) {
+ ext4_debug("ext4 move extent: The argument files should not "
+ "be same file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* Protect orig and donor inodes against a truncate */
+ ret1 = mext_inode_double_lock(orig_inode, donor_inode);
+ if (ret1 < 0)
+ return ret1;
- mext_double_down_read(orig_inode, donor_inode);
+ /* Protect extent tree against block allocations via delalloc */
+ double_down_write_data_sem(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
- ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
- donor_start, &len, *moved_len);
- mext_double_up_read(orig_inode, donor_inode);
- if (ret)
- goto out2;
+ ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
+ donor_start, &len);
+ if (ret1)
+ goto out;
file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
block_end = block_start + len - 1;
if (file_end < block_end)
len -= block_end - file_end;
- get_ext_path(orig_path, orig_inode, block_start, ret);
- if (orig_path == NULL)
- goto out2;
+ ret1 = get_ext_path(orig_inode, block_start, &orig_path);
+ if (ret1)
+ goto out;
/* Get path structure to check the hole */
- get_ext_path(holecheck_path, orig_inode, block_start, ret);
- if (holecheck_path == NULL)
+ ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
+ if (ret1)
goto out;
depth = ext_depth(orig_inode);
ext_cur = holecheck_path[depth].p_ext;
- if (ext_cur == NULL) {
- ret = -EINVAL;
- goto out;
- }
/*
- * Get proper extent whose ee_block is beyond block_start
- * if block_start was within the hole.
+ * Get proper starting location of block replacement if block_start was
+ * within the hole.
*/
if (le32_to_cpu(ext_cur->ee_block) +
ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
+ /*
+ * The hole exists between extents or the tail of
+ * original file.
+ */
last_extent = mext_next_extent(orig_inode,
holecheck_path, &ext_cur);
if (last_extent < 0) {
- ret = last_extent;
+ ret1 = last_extent;
goto out;
}
last_extent = mext_next_extent(orig_inode, orig_path,
&ext_dummy);
if (last_extent < 0) {
- ret = last_extent;
+ ret1 = last_extent;
goto out;
}
- }
- seq_start = block_start;
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ } else if (le32_to_cpu(ext_cur->ee_block) > block_start)
+ /* The hole exists at the beginning of original file. */
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ else
+ seq_start = block_start;
/* No blocks within the specified range. */
if (le32_to_cpu(ext_cur->ee_block) > block_end) {
ext4_debug("ext4 move extent: The specified range of file "
"may be the hole\n");
- ret = -EINVAL;
+ ret1 = -EINVAL;
goto out;
}
@@ -1206,7 +1287,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
last_extent = mext_next_extent(orig_inode, holecheck_path,
&ext_cur);
if (last_extent < 0) {
- ret = last_extent;
+ ret1 = last_extent;
break;
}
add_blocks = ext4_ext_get_actual_len(ext_cur);
@@ -1246,29 +1327,39 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
seq_start = le32_to_cpu(ext_cur->ee_block);
rest_blocks = seq_blocks;
- /* Discard preallocations of two inodes */
- down_write(&EXT4_I(orig_inode)->i_data_sem);
- ext4_discard_preallocations(orig_inode);
- up_write(&EXT4_I(orig_inode)->i_data_sem);
-
- down_write(&EXT4_I(donor_inode)->i_data_sem);
- ext4_discard_preallocations(donor_inode);
- up_write(&EXT4_I(donor_inode)->i_data_sem);
+ /*
+ * Up semaphore to avoid following problems:
+ * a. transaction deadlock among ext4_journal_start,
+ * ->write_begin via pagefault, and jbd2_journal_commit
+ * b. racing with ->readpage, ->write_begin, and ext4_get_block
+ * in move_extent_per_page
+ */
+ double_up_write_data_sem(orig_inode, donor_inode);
while (orig_page_offset <= seq_end_page) {
/* Swap original branches with new branches */
- ret = move_extent_par_page(o_filp, donor_inode,
+ block_len_in_page = move_extent_per_page(
+ o_filp, donor_inode,
orig_page_offset,
data_offset_in_page,
- block_len_in_page, uninit);
- if (ret < 0)
- goto out;
- orig_page_offset++;
+ block_len_in_page, uninit,
+ &ret1);
+
/* Count how many blocks we have exchanged */
*moved_len += block_len_in_page;
- BUG_ON(*moved_len > len);
+ if (ret1 < 0)
+ break;
+ if (*moved_len > len) {
+ ext4_error(orig_inode->i_sb, __func__,
+ "We replaced blocks too much! "
+ "sum of replaced: %llu requested: %llu",
+ *moved_len, len);
+ ret1 = -EIO;
+ break;
+ }
+ orig_page_offset++;
data_offset_in_page = 0;
rest_blocks -= block_len_in_page;
if (rest_blocks > blocks_per_page)
@@ -1277,20 +1368,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
block_len_in_page = rest_blocks;
}
+ double_down_write_data_sem(orig_inode, donor_inode);
+ if (ret1 < 0)
+ break;
+
/* Decrease buffer counter */
if (holecheck_path)
ext4_ext_drop_refs(holecheck_path);
- get_ext_path(holecheck_path, orig_inode,
- seq_start, ret);
- if (holecheck_path == NULL)
+ ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
+ if (ret1)
break;
depth = holecheck_path->p_depth;
/* Decrease buffer counter */
if (orig_path)
ext4_ext_drop_refs(orig_path);
- get_ext_path(orig_path, orig_inode, seq_start, ret);
- if (orig_path == NULL)
+ ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
+ if (ret1)
break;
ext_cur = holecheck_path[depth].p_ext;
@@ -1299,6 +1393,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
}
out:
+ if (*moved_len) {
+ ext4_discard_preallocations(orig_inode);
+ ext4_discard_preallocations(donor_inode);
+ }
+
if (orig_path) {
ext4_ext_drop_refs(orig_path);
kfree(orig_path);
@@ -1307,14 +1406,13 @@ out:
ext4_ext_drop_refs(holecheck_path);
kfree(holecheck_path);
}
-out2:
- mext_inode_double_unlock(orig_inode, donor_inode);
-
- if (ret)
- return ret;
+ double_up_write_data_sem(orig_inode, donor_inode);
+ ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
- /* All of the specified blocks must be exchanged in succeed */
- BUG_ON(*moved_len != len);
+ if (ret1)
+ return ret1;
+ else if (ret2)
+ return ret2;
return 0;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index de04013..9dcd686 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1292,9 +1292,6 @@ errout:
* add_dirent_to_buf will attempt search the directory block for
* space. It will return -ENOSPC if no space is available, and -EIO
* and -EEXIST if directory entry already exists.
- *
- * NOTE! bh is NOT released in the case where ENOSPC is returned. In
- * all other cases bh is released.
*/
static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
struct inode *inode, struct ext4_dir_entry_2 *de,
@@ -1315,14 +1312,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
top = bh->b_data + blocksize - reclen;
while ((char *) de <= top) {
if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
- bh, offset)) {
- brelse(bh);
+ bh, offset))
return -EIO;
- }
- if (ext4_match(namelen, name, de)) {
- brelse(bh);
+ if (ext4_match(namelen, name, de))
return -EEXIST;
- }
nlen = EXT4_DIR_REC_LEN(de->name_len);
rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
if ((de->inode? rlen - nlen: rlen) >= reclen)
@@ -1337,7 +1330,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
err = ext4_journal_get_write_access(handle, bh);
if (err) {
ext4_std_error(dir->i_sb, err);
- brelse(bh);
return err;
}
@@ -1377,7 +1369,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
err = ext4_handle_dirty_metadata(handle, dir, bh);
if (err)
ext4_std_error(dir->i_sb, err);
- brelse(bh);
return 0;
}
@@ -1471,7 +1462,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
if (!(de))
return retval;
- return add_dirent_to_buf(handle, dentry, inode, de, bh);
+ retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ brelse(bh);
+ return retval;
}
/*
@@ -1514,8 +1507,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
if(!bh)
return retval;
retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
- if (retval != -ENOSPC)
+ if (retval != -ENOSPC) {
+ brelse(bh);
return retval;
+ }
if (blocks == 1 && !dx_fallback &&
EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
@@ -1528,7 +1523,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
de = (struct ext4_dir_entry_2 *) bh->b_data;
de->inode = 0;
de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
- return add_dirent_to_buf(handle, dentry, inode, de, bh);
+ retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ brelse(bh);
+ return retval;
}
/*
@@ -1561,10 +1558,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
goto journal_error;
err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
- if (err != -ENOSPC) {
- bh = NULL;
+ if (err != -ENOSPC)
goto cleanup;
- }
/* Block full, should compress but for now just split */
dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
@@ -1590,9 +1585,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
goto cleanup;
node2 = (struct dx_node *)(bh2->b_data);
entries2 = node2->entries;
+ memset(&node2->fake, 0, sizeof(struct fake_dirent));
node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
sb->s_blocksize);
- node2->fake.inode = 0;
BUFFER_TRACE(frame->bh, "get_write_access");
err = ext4_journal_get_write_access(handle, frame->bh);
if (err)
@@ -1657,7 +1652,6 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (!de)
goto cleanup;
err = add_dirent_to_buf(handle, dentry, inode, de, bh);
- bh = NULL;
goto cleanup;
journal_error:
@@ -1775,7 +1769,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -1809,7 +1803,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -1846,7 +1840,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2068,7 +2062,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
struct ext4_iloc iloc;
int err = 0;
- if (!ext4_handle_valid(handle))
+ /* ext4_handle_valid() assumes a valid handle_t pointer */
+ if (handle && !ext4_handle_valid(handle))
return 0;
mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
@@ -2258,7 +2253,7 @@ static int ext4_symlink(struct inode *dir,
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2310,7 +2305,7 @@ static int ext4_link(struct dentry *old_dentry,
struct inode *inode = old_dentry->d_inode;
int err, retries = 0;
- if (EXT4_DIR_LINK_MAX(inode))
+ if (inode->i_nlink >= EXT4_LINK_MAX)
return -EMLINK;
/*
@@ -2413,7 +2408,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
goto end_rename;
retval = -EMLINK;
if (!new_inode && new_dir != old_dir &&
- new_dir->i_nlink >= EXT4_LINK_MAX)
+ EXT4_DIR_LINK_MAX(new_dir))
goto end_rename;
}
if (!new_bh) {
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 68b0351..96302cd 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -247,7 +247,7 @@ static int setup_new_group_blocks(struct super_block *sb,
goto exit_bh;
if (IS_ERR(gdb = bclean(handle, sb, block))) {
- err = PTR_ERR(bh);
+ err = PTR_ERR(gdb);
goto exit_bh;
}
ext4_handle_dirty_metadata(handle, NULL, gdb);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8f4f079..ed38f25 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,6 +45,7 @@
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
+#include "mballoc.h"
#define CREATE_TRACE_POINTS
#include <trace/events/ext4.h>
@@ -188,6 +189,36 @@ void ext4_itable_unused_set(struct super_block *sb,
bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
}
+
+/* Just increment the non-pointer handle value */
+static handle_t *ext4_get_nojournal(void)
+{
+ handle_t *handle = current->journal_info;
+ unsigned long ref_cnt = (unsigned long)handle;
+
+ BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
+
+ ref_cnt++;
+ handle = (handle_t *)ref_cnt;
+
+ current->journal_info = handle;
+ return handle;
+}
+
+
+/* Decrement the non-pointer handle value */
+static void ext4_put_nojournal(handle_t *handle)
+{
+ unsigned long ref_cnt = (unsigned long)handle;
+
+ BUG_ON(ref_cnt == 0);
+
+ ref_cnt--;
+ handle = (handle_t *)ref_cnt;
+
+ current->journal_info = handle;
+}
+
/*
* Wrappers for jbd2_journal_start/end.
*
@@ -214,11 +245,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
}
return jbd2_journal_start(journal, nblocks);
}
- /*
- * We're not journaling, return the appropriate indication.
- */
- current->journal_info = EXT4_NOJOURNAL_HANDLE;
- return current->journal_info;
+ return ext4_get_nojournal();
}
/*
@@ -234,11 +261,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
int rc;
if (!ext4_handle_valid(handle)) {
- /*
- * Do this here since we don't call jbd2_journal_stop() in
- * no-journal mode.
- */
- current->journal_info = NULL;
+ ext4_put_nojournal(handle);
return 0;
}
sb = handle->h_transaction->t_journal->j_private;
@@ -344,7 +367,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
errstr = "Out of memory";
break;
case -EROFS:
- if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)
+ if (!sb || (EXT4_SB(sb)->s_journal &&
+ EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
errstr = "Journal has aborted";
else
errstr = "Readonly filesystem";
@@ -578,15 +602,14 @@ static void ext4_put_super(struct super_block *sb)
struct ext4_super_block *es = sbi->s_es;
int i, err;
+ flush_workqueue(sbi->dio_unwritten_wq);
+ destroy_workqueue(sbi->dio_unwritten_wq);
+
lock_super(sb);
lock_kernel();
if (sb->s_dirt)
ext4_commit_super(sb, 1);
- ext4_release_system_zone(sb);
- ext4_mb_release(sb);
- ext4_ext_release(sb);
- ext4_xattr_put_super(sb);
if (sbi->s_journal) {
err = jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
@@ -594,6 +617,12 @@ static void ext4_put_super(struct super_block *sb)
ext4_abort(sb, __func__,
"Couldn't clean up the journal");
}
+
+ ext4_release_system_zone(sb);
+ ext4_mb_release(sb);
+ ext4_ext_release(sb);
+ ext4_xattr_put_super(sb);
+
if (!(sb->s_flags & MS_RDONLY)) {
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -682,6 +711,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_allocated_meta_blocks = 0;
ei->i_delalloc_reserved_flag = 0;
spin_lock_init(&(ei->i_block_reservation_lock));
+ INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
+ ei->cur_aio_dio = NULL;
+ ei->i_sync_tid = 0;
+ ei->i_datasync_tid = 0;
return &ei->vfs_inode;
}
@@ -877,6 +910,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (test_opt(sb, NO_AUTO_DA_ALLOC))
seq_puts(seq, ",noauto_da_alloc");
+ if (test_opt(sb, DISCARD))
+ seq_puts(seq, ",discard");
+
+ if (test_opt(sb, NOLOAD))
+ seq_puts(seq, ",norecovery");
+
ext4_show_quota_options(seq, sb);
return 0;
@@ -1057,7 +1096,8 @@ enum {
Opt_usrquota, Opt_grpquota, Opt_i_version,
Opt_stripe, Opt_delalloc, Opt_nodelalloc,
Opt_block_validity, Opt_noblock_validity,
- Opt_inode_readahead_blks, Opt_journal_ioprio
+ Opt_inode_readahead_blks, Opt_journal_ioprio,
+ Opt_discard, Opt_nodiscard,
};
static const match_table_t tokens = {
@@ -1082,6 +1122,7 @@ static const match_table_t tokens = {
{Opt_acl, "acl"},
{Opt_noacl, "noacl"},
{Opt_noload, "noload"},
+ {Opt_noload, "norecovery"},
{Opt_nobh, "nobh"},
{Opt_bh, "bh"},
{Opt_commit, "commit=%u"},
@@ -1123,6 +1164,8 @@ static const match_table_t tokens = {
{Opt_auto_da_alloc, "auto_da_alloc=%u"},
{Opt_auto_da_alloc, "auto_da_alloc"},
{Opt_noauto_da_alloc, "noauto_da_alloc"},
+ {Opt_discard, "discard"},
+ {Opt_nodiscard, "nodiscard"},
{Opt_err, NULL},
};
@@ -1551,6 +1594,12 @@ set_qf_format:
else
set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
break;
+ case Opt_discard:
+ set_opt(sbi->s_mount_opt, DISCARD);
+ break;
+ case Opt_nodiscard:
+ clear_opt(sbi->s_mount_opt, DISCARD);
+ break;
default:
ext4_msg(sb, KERN_ERR,
"Unrecognized mount option \"%s\" "
@@ -1666,14 +1715,14 @@ static int ext4_fill_flex_info(struct super_block *sb)
size_t size;
int i;
- if (!sbi->s_es->s_log_groups_per_flex) {
+ sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
+ groups_per_flex = 1 << sbi->s_log_groups_per_flex;
+
+ if (groups_per_flex < 2) {
sbi->s_log_groups_per_flex = 0;
return 1;
}
- sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
- groups_per_flex = 1 << sbi->s_log_groups_per_flex;
-
/* We allocate both existing and potentially added groups */
flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
@@ -1695,12 +1744,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
flex_group = ext4_flex_group(sbi, i);
- atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
- ext4_free_inodes_count(sb, gdp));
- atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
- ext4_free_blks_count(sb, gdp));
- atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
- ext4_used_dirs_count(sb, gdp));
+ atomic_add(ext4_free_inodes_count(sb, gdp),
+ &sbi->s_flex_groups[flex_group].free_inodes);
+ atomic_add(ext4_free_blks_count(sb, gdp),
+ &sbi->s_flex_groups[flex_group].free_blocks);
+ atomic_add(ext4_used_dirs_count(sb, gdp),
+ &sbi->s_flex_groups[flex_group].used_dirs);
}
return 1;
@@ -2197,6 +2246,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
static struct attribute *ext4_attrs[] = {
ATTR_LIST(delayed_allocation_blocks),
@@ -2210,6 +2260,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(mb_order2_req),
ATTR_LIST(mb_stream_req),
ATTR_LIST(mb_group_prealloc),
+ ATTR_LIST(max_writeback_mb_bump),
NULL,
};
@@ -2253,6 +2304,49 @@ static struct kobj_type ext4_ktype = {
.release = ext4_sb_release,
};
+/*
+ * Check whether this filesystem can be mounted based on
+ * the features present and the RDONLY/RDWR mount requested.
+ * Returns 1 if this filesystem can be mounted as requested,
+ * 0 if it cannot be.
+ */
+static int ext4_feature_set_ok(struct super_block *sb, int readonly)
+{
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
+ ext4_msg(sb, KERN_ERR,
+ "Couldn't mount because of "
+ "unsupported optional features (%x)",
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
+ ~EXT4_FEATURE_INCOMPAT_SUPP));
+ return 0;
+ }
+
+ if (readonly)
+ return 1;
+
+ /* Check that feature set is OK for a read-write mount */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
+ ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
+ "unsupported optional features (%x)",
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
+ ~EXT4_FEATURE_RO_COMPAT_SUPP));
+ return 0;
+ }
+ /*
+ * Large file size enabled file system can only be mounted
+ * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
+ */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+ if (sizeof(blkcnt_t) < sizeof(u64)) {
+ ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
+ "cannot be mounted RDWR without "
+ "CONFIG_LBDAF");
+ return 0;
+ }
+ }
+ return 1;
+}
+
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
__releases(kernel_lock)
__acquires(kernel_lock)
@@ -2274,7 +2368,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
unsigned int db_count;
unsigned int i;
int needs_recovery, has_huge_files;
- int features;
__u64 blocks_count;
int err;
unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
@@ -2401,39 +2494,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* previously didn't change the revision level when setting the flags,
* so there is a chance incompat flags are set on a rev 0 filesystem.
*/
- features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
- if (features) {
- ext4_msg(sb, KERN_ERR,
- "Couldn't mount because of "
- "unsupported optional features (%x)",
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
- ~EXT4_FEATURE_INCOMPAT_SUPP));
- goto failed_mount;
- }
- features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
- if (!(sb->s_flags & MS_RDONLY) && features) {
- ext4_msg(sb, KERN_ERR,
- "Couldn't mount RDWR because of "
- "unsupported optional features (%x)",
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
- ~EXT4_FEATURE_RO_COMPAT_SUPP));
+ if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
goto failed_mount;
- }
- has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
- if (has_huge_files) {
- /*
- * Large file size enabled file system can only be
- * mount if kernel is build with CONFIG_LBDAF
- */
- if (sizeof(root->i_blocks) < sizeof(u64) &&
- !(sb->s_flags & MS_RDONLY)) {
- ext4_msg(sb, KERN_ERR, "Filesystem with huge "
- "files cannot be mounted read-write "
- "without CONFIG_LBDAF");
- goto failed_mount;
- }
- }
+
blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
if (blocksize < EXT4_MIN_BLOCK_SIZE ||
@@ -2469,6 +2532,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
}
+ has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
has_huge_files);
sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
@@ -2549,12 +2614,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
- if (ext4_blocks_count(es) >
- (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+ /*
+ * Test whether we have more sectors than will fit in sector_t,
+ * and whether the max offset is addressable by the page cache.
+ */
+ if ((ext4_blocks_count(es) >
+ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
+ (ext4_blocks_count(es) >
+ (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
ext4_msg(sb, KERN_ERR, "filesystem"
- " too large to mount safely");
+ " too large to mount safely on this system");
if (sizeof(sector_t) < 8)
ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
+ ret = -EFBIG;
goto failed_mount;
}
@@ -2595,6 +2667,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
sbi->s_groups_count = blocks_count;
+ sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+ (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
EXT4_DESC_PER_BLOCK(sb);
sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
@@ -2656,6 +2730,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
sbi->s_stripe = ext4_get_stripe_size(sbi);
+ sbi->s_max_writeback_mb_bump = 128;
/*
* set up enough so that it can read an inode
@@ -2781,6 +2856,12 @@ no_journal:
clear_opt(sbi->s_mount_opt, NOBH);
}
}
+ EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
+ if (!EXT4_SB(sb)->dio_unwritten_wq) {
+ printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
+ goto failed_mount_wq;
+ }
+
/*
* The jbd2_journal_load will have done any necessary log recovery,
* so we can safely mount the rest of the filesystem now.
@@ -2893,6 +2974,8 @@ cantfind_ext4:
failed_mount4:
ext4_msg(sb, KERN_ERR, "mount failed");
+ destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
+failed_mount_wq:
ext4_release_system_zone(sb);
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
@@ -3208,7 +3291,18 @@ static int ext4_commit_super(struct super_block *sb, int sync)
clear_buffer_write_io_error(sbh);
set_buffer_uptodate(sbh);
}
- es->s_wtime = cpu_to_le32(get_seconds());
+ /*
+ * If the file system is mounted read-only, don't update the
+ * superblock write time. This avoids updating the superblock
+ * write time when we are mounting the root file system
+ * read/only but we need to replay the journal; at that point,
+ * for people who are east of GMT and who make their clock
+ * tick in localtime for Windows bug-for-bug compatibility,
+ * the clock is set in the future, and this will cause e2fsck
+ * to complain and force a full file system check.
+ */
+ if (!(sb->s_flags & MS_RDONLY))
+ es->s_wtime = cpu_to_le32(get_seconds());
es->s_kbytes_written =
cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
@@ -3333,11 +3427,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
{
int ret = 0;
tid_t target;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
trace_ext4_sync_fs(sb, wait);
- if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
+ flush_workqueue(sbi->dio_unwritten_wq);
+ if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
if (wait)
- jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
+ jbd2_log_wait_commit(sbi->s_journal, target);
}
return ret;
}
@@ -3477,18 +3573,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (sbi->s_journal)
ext4_mark_recovery_complete(sb, es);
} else {
- int ret;
- if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
- ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
- ext4_msg(sb, KERN_WARNING, "couldn't "
- "remount RDWR because of unsupported "
- "optional features (%x)",
- (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
- ~EXT4_FEATURE_RO_COMPAT_SUPP));
+ /* Make sure we can mount this feature set readwrite */
+ if (!ext4_feature_set_ok(sb, 0)) {
err = -EROFS;
goto restore_opts;
}
-
/*
* Make sure the group descriptor checksums
* are sane. If they aren't, refuse to remount r/w.
@@ -3624,13 +3713,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
- ext4_free_blocks_count_set(es, buf->f_bfree);
buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
if (buf->f_bfree < ext4_r_blocks_count(es))
buf->f_bavail = 0;
buf->f_files = le32_to_cpu(es->s_inodes_count);
buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
- es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
buf->f_namelen = EXT4_NAME_LEN;
fsid = le64_to_cpup((void *)es->s_uuid) ^
le64_to_cpup((void *)es->s_uuid + sizeof(u64));
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 62b31c2..0257019 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,12 +810,23 @@ inserted:
get_bh(new_bh);
} else {
/* We need to allocate a new block */
- ext4_fsblk_t goal = ext4_group_first_block_no(sb,
+ ext4_fsblk_t goal, block;
+
+ goal = ext4_group_first_block_no(sb,
EXT4_I(inode)->i_block_group);
- ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
+
+ /* non-extent files can't have physical blocks past 2^32 */
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+
+ block = ext4_new_meta_blocks(handle, inode,
goal, NULL, &error);
if (error)
goto cleanup;
+
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
+
ea_idebug(inode, "creating block %d", block);
new_bh = sb_getblk(sb, block);
@@ -977,6 +988,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (error)
goto cleanup;
+ error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ if (error)
+ goto cleanup;
+
if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
@@ -1002,9 +1017,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (flags & XATTR_CREATE)
goto cleanup;
}
- error = ext4_journal_get_write_access(handle, is.iloc.bh);
- if (error)
- goto cleanup;
if (!value) {
if (!is.s.not_found)
error = ext4_xattr_ibody_set(handle, inode, &i, &is);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7b4088b..8cf902a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -636,6 +636,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
JBUFFER_TRACE(jh, "ph3: write metadata");
flags = jbd2_journal_write_metadata_buffer(commit_transaction,
jh, &new_jh, blocknr);
+ if (flags < 0) {
+ jbd2_journal_abort(journal, flags);
+ continue;
+ }
set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
wbuf[bufs++] = jh2bh(new_jh);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e378cb3..4b74149 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -78,6 +78,7 @@ EXPORT_SYMBOL(jbd2_journal_errno);
EXPORT_SYMBOL(jbd2_journal_ack_err);
EXPORT_SYMBOL(jbd2_journal_clear_err);
EXPORT_SYMBOL(jbd2_log_wait_commit);
+EXPORT_SYMBOL(jbd2_log_start_commit);
EXPORT_SYMBOL(jbd2_journal_start_commit);
EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
EXPORT_SYMBOL(jbd2_journal_wipe);
@@ -361,6 +362,10 @@ repeat:
jbd_unlock_bh_state(bh_in);
tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
+ if (!tmp) {
+ jbd2_journal_put_journal_head(new_jh);
+ return -ENOMEM;
+ }
jbd_lock_bh_state(bh_in);
if (jh_in->b_frozen_data) {
jbd2_free(tmp, bh_in->b_size);
@@ -1187,6 +1192,12 @@ static int journal_reset(journal_t *journal)
first = be32_to_cpu(sb->s_first);
last = be32_to_cpu(sb->s_maxlen);
+ if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
+ printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n",
+ first, last);
+ journal_fail_superblock(journal);
+ return -EINVAL;
+ }
journal->j_first = first;
journal->j_last = last;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6213ac7..a051270 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
INIT_LIST_HEAD(&transaction->t_private_list);
/* Set up the commit timer for the new transaction. */
- journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
+ journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
add_timer(&journal->j_commit_timer);
J_ASSERT(journal->j_running_transaction == NULL);
@@ -238,6 +238,8 @@ repeat_locked:
__jbd2_log_space_left(journal));
spin_unlock(&transaction->t_handle_lock);
spin_unlock(&journal->j_state_lock);
+
+ lock_map_acquire(&handle->h_lockdep_map);
out:
if (unlikely(new_transaction)) /* It's usually NULL */
kfree(new_transaction);
@@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
handle = ERR_PTR(err);
goto out;
}
-
- lock_map_acquire(&handle->h_lockdep_map);
out:
return handle;
}
@@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
__jbd2_log_start_commit(journal, transaction->t_tid);
spin_unlock(&journal->j_state_lock);
+ lock_map_release(&handle->h_lockdep_map);
handle->h_buffer_credits = nblocks;
ret = start_this_handle(journal, handle);
return ret;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0f1ea4a..d3e910b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1999,11 +1999,18 @@ static inline int is_si_special(const struct siginfo *info)
return info <= SEND_SIG_FORCED;
}
-/* True if we are on the alternate signal stack. */
-
+/*
+ * True if we are on the alternate signal stack.
+ */
static inline int on_sig_stack(unsigned long sp)
{
- return (sp - current->sas_ss_sp < current->sas_ss_size);
+#ifdef CONFIG_STACK_GROWSUP
+ return sp >= current->sas_ss_sp &&
+ sp - current->sas_ss_sp < current->sas_ss_size;
+#else
+ return sp > current->sas_ss_sp &&
+ sp - current->sas_ss_sp <= current->sas_ss_size;
+#endif
}
static inline int sas_ss_flags(unsigned long sp)
diff --git a/include/scsi/osd_protocol.h b/include/scsi/osd_protocol.h
index 2cc8e8b..6856612 100644
--- a/include/scsi/osd_protocol.h
+++ b/include/scsi/osd_protocol.h
@@ -17,6 +17,7 @@
#define __OSD_PROTOCOL_H__
#include <linux/types.h>
+#include <linux/kernel.h>
#include <asm/unaligned.h>
#include <scsi/scsi.h>
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index b62a097..6cc72e2 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -677,6 +677,12 @@ struct Scsi_Host {
void *shost_data;
/*
+ * Points to the physical bus device we'd use to do DMA
+ * Needed just in case we have virtual hosts.
+ */
+ struct device *dma_dev;
+
+ /*
* We should ensure that this is aligned, both for better performance
* and also because some compilers (m68k) don't automatically force
* alignment to a long boundary.
@@ -720,7 +726,9 @@ extern int scsi_queue_work(struct Scsi_Host *, struct work_struct *);
extern void scsi_flush_work(struct Scsi_Host *);
extern struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *, int);
-extern int __must_check scsi_add_host(struct Scsi_Host *, struct device *);
+extern int __must_check scsi_add_host_with_dma(struct Scsi_Host *,
+ struct device *,
+ struct device *);
extern void scsi_scan_host(struct Scsi_Host *);
extern void scsi_rescan_device(struct device *);
extern void scsi_remove_host(struct Scsi_Host *);
@@ -731,6 +739,12 @@ extern const char *scsi_host_state_name(enum scsi_host_state);
extern u64 scsi_calculate_bounce_limit(struct Scsi_Host *);
+static inline int __must_check scsi_add_host(struct Scsi_Host *host,
+ struct device *dev)
+{
+ return scsi_add_host_with_dma(host, dev, dev);
+}
+
static inline struct device *scsi_get_device(struct Scsi_Host *shost)
{
return shost->shost_gendev.parent;
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 7d8b5bc..824979e 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -5,10 +5,12 @@
#define _TRACE_EXT4_H
#include <linux/writeback.h>
-#include "../../../fs/ext4/ext4.h"
-#include "../../../fs/ext4/mballoc.h"
#include <linux/tracepoint.h>
+struct ext4_allocation_context;
+struct ext4_allocation_request;
+struct ext4_prealloc_space;
+
TRACE_EVENT(ext4_free_inode,
TP_PROTO(struct inode *inode),
@@ -229,6 +231,7 @@ TRACE_EVENT(ext4_da_writepages,
__field( char, for_reclaim )
__field( char, for_writepages )
__field( char, range_cyclic )
+ __field( pgoff_t, writeback_index )
),
TP_fast_assign(
@@ -243,14 +246,51 @@ TRACE_EVENT(ext4_da_writepages,
__entry->for_reclaim = wbc->for_reclaim;
__entry->for_writepages = wbc->for_writepages;
__entry->range_cyclic = wbc->range_cyclic;
+ __entry->writeback_index = inode->i_mapping->writeback_index;
),
- TP_printk("dev %s ino %lu nr_t_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d for_writepages %d range_cyclic %d",
- jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->nr_to_write,
+ TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d for_writepages %d range_cyclic %d writeback_index %lu",
+ jbd2_dev_to_name(__entry->dev),
+ (unsigned long) __entry->ino, __entry->nr_to_write,
__entry->pages_skipped, __entry->range_start,
__entry->range_end, __entry->nonblocking,
__entry->for_kupdate, __entry->for_reclaim,
- __entry->for_writepages, __entry->range_cyclic)
+ __entry->for_writepages, __entry->range_cyclic,
+ (unsigned long) __entry->writeback_index)
+);
+
+TRACE_EVENT(ext4_da_write_pages,
+ TP_PROTO(struct inode *inode, struct mpage_da_data *mpd),
+
+ TP_ARGS(inode, mpd),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( ino_t, ino )
+ __field( __u64, b_blocknr )
+ __field( __u32, b_size )
+ __field( __u32, b_state )
+ __field( unsigned long, first_page )
+ __field( int, io_done )
+ __field( int, pages_written )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->b_blocknr = mpd->b_blocknr;
+ __entry->b_size = mpd->b_size;
+ __entry->b_state = mpd->b_state;
+ __entry->first_page = mpd->first_page;
+ __entry->io_done = mpd->io_done;
+ __entry->pages_written = mpd->pages_written;
+ ),
+
+ TP_printk("dev %s ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d",
+ jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+ __entry->b_blocknr, __entry->b_size,
+ __entry->b_state, __entry->first_page,
+ __entry->io_done, __entry->pages_written)
);
TRACE_EVENT(ext4_da_writepages_result,
@@ -268,6 +308,7 @@ TRACE_EVENT(ext4_da_writepages_result,
__field( char, encountered_congestion )
__field( char, more_io )
__field( char, no_nrwrite_index_update )
+ __field( pgoff_t, writeback_index )
),
TP_fast_assign(
@@ -279,13 +320,16 @@ TRACE_EVENT(ext4_da_writepages_result,
__entry->encountered_congestion = wbc->encountered_congestion;
__entry->more_io = wbc->more_io;
__entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update;
+ __entry->writeback_index = inode->i_mapping->writeback_index;
),
- TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d",
- jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->ret,
+ TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d writeback_index %lu",
+ jbd2_dev_to_name(__entry->dev),
+ (unsigned long) __entry->ino, __entry->ret,
__entry->pages_written, __entry->pages_skipped,
__entry->encountered_congestion, __entry->more_io,
- __entry->no_nrwrite_index_update)
+ __entry->no_nrwrite_index_update,
+ (unsigned long) __entry->writeback_index)
);
TRACE_EVENT(ext4_da_write_begin,
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists