lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Date:	Mon, 2 Apr 2007 15:37:28 -0600
From:	Andreas Dilger <adilger@...sterfs.com>
To:	Eric Sandeen <esandeen@...hat.com>, linux-ext4@...r.kernel.org
Subject: [PATCH] uninitialized groups kernel patch

This patch (against 2.6.16.27-0.9 = latest SLES10 kernel) adds
support for the uninit_groups feature to the kernel.  This speeds up
e2fsck on most filesystems by a factor of 2-10x in our tests.  It might
speed up the kernel by a tiny fraction because we don't have to read
uninitialized bitmaps from disk, but not a huge amount.  We already skip
reading inode table blocks from disk if they are unused, since 2.6.12 or
so (and that was a major speedup for file creation).

A separate patch will be sent for the e2fsprogs part of the work.

We keep a high water mark of used inodes for each group to improve e2fsck
time.  Block and inode bitmaps can be uninitialized on disk via a flag in
the group descriptor to avoid reading or scanning them at e2fsck time.

A checksum of each group descriptor is used to ensure that corruption in
the group descriptor's bit flags does not cause incorrect operation.  If the
checksum is incorrect, then either we refuse to mount (e2fsck will fix this
at the beginning of pass1), or we mark the group as unusable (0 free inodes
and 0 free blocks) so the kernel won't try to allocate from there (it may
be that the UNINIT flags were "cleared" by corruption and we would further
corrupt the filesystem by using invalid bitmaps).

To enable on a new filesystem:
- mke2fs -O uninit_groups /dev/XXX

To enable on a new filesystem and not zero out the inode table (faster, and
good for loopback filesystems, not so great for real world usage until TODO
is complete):
- mke2fs -O lazy_bg,uninit_groups /dev/XXX

To enable on existing filesystem (set bg_itable_unused and/or UNINIT flags
				  on groups, set csum):
- tune2fs -O uninit_groups /dev/XXX
- e2fsck -fy /dev/XXX  (expect a lot of "checksum incorrect" messages)


To disable this feature (clear UNINIT flags, csum):
- tune2fs -O ^uninit_groups /dev/XXX
- e2fsck -fy /dev/XXX  (expect a lot of "checksum incorrect" messages)


TODO:
- have the kernel launch a thread at mount time to zero out uninitialized
  inode tables (i.e. no BG_INODE_ZEROED flag set) so that e2fsck doesn't
  get confused on a newly-formatted filesystem
- handle larger group descriptor in ext3_group_desc_csum() (this isn't in
  the SLES10 kernel that I'm doing build/test on)
- use ext3_init_*_bitmap() functions in resize.c
- have UNINIT flags affect inode & block placement policy to avoid gratuitous
  initialization of all group bitmaps

Review & comments welcome.

Signed-off-by: Andreas Dilger <adilger@...sterfs.com>
Signed-off-by: Girish Shilamkar <girish@...sterfs.com>
Signed-off-by: Kalpak Shah <kalpak@...sterfs.com>

====================== ext3-uninit-2.6.16-sles10.patch =================

Index: linux-2.6.16.27-0.9-full/include/linux/ext3_fs.h
===================================================================
--- linux-2.6.16.27-0.9-full.orig/include/linux/ext3_fs.h	2007-03-28 18:20:16.000000000 +0400
+++ linux-2.6.16.27-0.9-full/include/linux/ext3_fs.h	2007-03-28 18:30:06.000000000 +0400
@@ -153,16 +153,22 @@ struct ext3_allocation_request {
  */
 struct ext3_group_desc
 {
-	__le32	bg_block_bitmap;		/* Blocks bitmap block */
-	__le32	bg_inode_bitmap;		/* Inodes bitmap block */
+	__le32	bg_block_bitmap;	/* Blocks bitmap block */
+	__le32	bg_inode_bitmap;	/* Inodes bitmap block */
 	__le32	bg_inode_table;		/* Inodes table block */
 	__le16	bg_free_blocks_count;	/* Free blocks count */
 	__le16	bg_free_inodes_count;	/* Free inodes count */
 	__le16	bg_used_dirs_count;	/* Directories count */
-	__u16	bg_pad;
-	__le32	bg_reserved[3];
+	__le16  bg_flags;		/* EXT3_BG_flags (UNINIT, etc) */
+	__le32	bg_reserved[2];		/* Likely block/inode bitmap checksum */
+	__le16	bg_itable_unused;	/* Unused inodes count */
+	__le16	bg_checksum;		/* crc16(sb_uuid+group+desc) */
 };
 
+#define EXT3_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
+#define EXT3_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not in use */
+#define EXT3_BG_INODE_ZEROED	0x0004 /* On-disk itable initialized to zero */
+
 /*
  * Macro-instructions used to manage group descriptors
  */
@@ -590,6 +596,7 @@ static inline struct ext3_inode_info *EX
 #define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER	0x0001
 #define EXT3_FEATURE_RO_COMPAT_LARGE_FILE	0x0002
 #define EXT3_FEATURE_RO_COMPAT_BTREE_DIR	0x0004
+#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM		0x0010
 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK	0x0020
 
 #define EXT3_FEATURE_INCOMPAT_COMPRESSION	0x0001
@@ -606,6 +613,7 @@ static inline struct ext3_inode_info *EX
 					 EXT3_FEATURE_INCOMPAT_EXTENTS)
 #define EXT3_FEATURE_RO_COMPAT_SUPP	(EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \
+					 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
 					 EXT4_FEATURE_RO_COMPAT_DIR_NLINK| \
 					 EXT3_FEATURE_RO_COMPAT_BTREE_DIR)
 
Index: linux-2.6.16.27-0.9-full/fs/ext3/resize.c
===================================================================
--- linux-2.6.16.27-0.9-full.orig/fs/ext3/resize.c	2007-03-13 02:56:52.000000000 +0300
+++ linux-2.6.16.27-0.9-full/fs/ext3/resize.c	2007-03-28 18:30:06.000000000 +0400
@@ -19,6 +19,7 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 
+#include "group.h"
 
 #define outside(b, first, last)	((b) < (first) || (b) >= (last))
 #define inside(b, first, last)	((b) >= (first) && (b) < (last))
@@ -818,6 +819,7 @@ int ext3_group_add(struct super_block *s
 	gdp->bg_inode_table = cpu_to_le32(input->inode_table);
 	gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
 	gdp->bg_free_inodes_count = cpu_to_le16(EXT3_INODES_PER_GROUP(sb));
+	gdp->bg_checksum = ext3_group_desc_csum(sbi, input->group, gdp);
 
 	/*
 	 * Make the new blocks and inodes valid next.  We do this before
Index: linux-2.6.16.27-0.9-full/fs/ext3/super.c
===================================================================
--- linux-2.6.16.27-0.9-full.orig/fs/ext3/super.c	2007-03-28 18:25:51.000000000 +0400
+++ linux-2.6.16.27-0.9-full/fs/ext3/super.c	2007-03-28 18:30:06.000000000 +0400
@@ -42,6 +42,7 @@
 #include "xattr.h"
 #include "acl.h"
 #include "namei.h"
+#include "group.h"
 
 static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
 			     unsigned long journal_devnum);
@@ -1221,6 +1222,90 @@ static int ext3_setup_super(struct super
 	return res;
 }
 
+#if !defined(CONFIG_CRC16) && !defined(CONFIG_CRC16_MODULE)
+/** CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1) */
+__u16 const crc16_table[256] = {
+	0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
+	0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
+	0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
+	0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
+	0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
+	0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
+	0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
+	0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
+	0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
+	0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
+	0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
+	0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
+	0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
+	0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
+	0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
+	0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
+	0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
+	0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
+	0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
+	0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
+	0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
+	0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
+	0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
+	0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
+	0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
+	0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
+	0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
+	0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
+	0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
+	0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
+	0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
+	0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
+};
+
+static inline __u16 crc16_byte(__u16 crc, const __u8 data)
+{
+	return (crc >> 8) ^ crc16_table[(crc ^ data) & 0xff];
+}
+
+__u16 crc16(__u16 crc, __u8 const *buffer, size_t len)
+{
+	while (len--)
+		crc = crc16_byte(crc, *buffer++);
+	return crc;
+}
+#endif
+
+__le16 ext3_group_desc_csum(struct ext3_sb_info *sbi, __u32 block_group,
+			    struct ext3_group_desc *gdp)
+{
+	__u16 crc = 0;
+
+	if (sbi->s_es->s_feature_ro_compat &
+	    cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+		int offset = offsetof(struct ext3_group_desc, bg_checksum);
+		__le32 le_group = cpu_to_le32(block_group);
+
+		crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
+		crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
+		crc = crc16(crc, (__u8 *)gdp, offset);
+		offset += sizeof(gdp->bg_checksum); /* skip checksum */
+		BUG_ON(offset != sizeof(*gdp)); /* XXX handle s_desc_size */
+		/* for checksum of struct ext4_group_desc do the rest...
+		if (offset < sbi->s_es->s_desc_size) {
+			crc = crc16(crc, (__u8 *)gdp + offset,
+				    sbi->s_es->s_desc_size - offset);
+		 */
+	}
+
+	return cpu_to_le16(crc);
+}
+
+int ext3_group_desc_csum_verify(struct ext3_sb_info *sbi, __u32 block_group,
+				struct ext3_group_desc *gdp)
+{
+	if (gdp->bg_checksum != ext3_group_desc_csum(sbi, block_group, gdp))
+		return 0;
+
+	return 1;
+}
+
 /* Called at mount-time, super-block is locked */
 static int ext3_check_descriptors (struct super_block * sb)
 {
@@ -1270,6 +1355,13 @@ static int ext3_check_descriptors (struc
 					le32_to_cpu(gdp->bg_inode_table));
 			return 0;
 		}
+		if (!ext3_group_desc_csum_verify(sbi, i, gdp)) {
+			ext3_error(sb, __FUNCTION__,
+				   "Checksum for group %d failed (%u!=%u)\n", i,
+				   le16_to_cpu(ext3_group_desc_csum(sbi,i,gdp)),
+				   le16_to_cpu(gdp->bg_checksum));
+			return 0;
+		}
 		block += EXT3_BLOCKS_PER_GROUP(sb);
 		gdp++;
 	}
Index: linux-2.6.16.27-0.9-full/fs/ext3/group.h
===================================================================
--- linux-2.6.16.27-0.9-full.orig/fs/ext3/group.h	2007-02-13 18:39:59.640066087 +0300
+++ linux-2.6.16.27-0.9-full/fs/ext3/group.h	2007-03-28 18:30:06.000000000 +0400
@@ -0,0 +1,29 @@
+/*
+ *  linux/fs/ext3/group.h
+ *
+ * Copyright (C) 2007 Cluster File Systems, Inc
+ *
+ * Author: Andreas Dilger <adilger@...sterfs.com>
+ */
+
+#ifndef _LINUX_EXT3_GROUP_H
+#define _LINUX_EXT3_GROUP_H
+#if defined(CONFIG_CRC16) || defined(CONFIG_CRC16_MODULE)
+#include <linux/crc16.h>
+#endif
+
+extern __le16 ext3_group_desc_csum(struct ext3_sb_info *sbi, __u32 group,
+				   struct ext3_group_desc *gdp);
+extern int ext3_group_desc_csum_verify(struct ext3_sb_info *sbi, __u32 group,
+				       struct ext3_group_desc *gdp);
+struct buffer_head *read_block_bitmap(struct super_block *sb,
+				      unsigned int block_group);
+extern unsigned ext3_init_block_bitmap(struct super_block *sb,
+				       struct buffer_head *bh, int group,
+				       struct ext3_group_desc *desc);
+#define ext3_free_blocks_after_init(sb, group, desc)			\
+		ext3_init_block_bitmap(sb, NULL, group, desc)
+extern unsigned ext3_init_inode_bitmap(struct super_block *sb,
+				       struct buffer_head *bh, int group,
+				       struct ext3_group_desc *desc);
+#endif /* _LINUX_EXT3_GROUP_H */
Index: linux-2.6.16.27-0.9-full/fs/ext3/ialloc.c
===================================================================
--- linux-2.6.16.27-0.9-full.orig/fs/ext3/ialloc.c	2007-03-28 18:20:17.000000000 +0400
+++ linux-2.6.16.27-0.9-full/fs/ext3/ialloc.c	2007-03-28 18:30:06.000000000 +0400
@@ -28,6 +28,7 @@
 
 #include "xattr.h"
 #include "acl.h"
+#include "group.h"
 
 /*
  * ialloc.c contains the inodes allocation and deallocation routines
@@ -43,6 +44,52 @@
  * the free blocks count in the block.
  */
 
+/*
+ * To avoid calling the atomic setbit hundreds or thousands of times, we only
+ * need to use it within a single byte (to ensure we get endianness right).
+ * We can use memset for the rest of the bitmap as there are no other users.
+ */
+static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
+{
+	int i;
+
+	if (start_bit >= end_bit)
+		return;
+
+	ext3_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
+	for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
+		ext3_set_bit(i, bitmap);
+	if (i < end_bit)
+		memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
+}
+
+/* Initializes an uninitialized inode bitmap */
+unsigned ext3_init_inode_bitmap(struct super_block *sb,
+				struct buffer_head *bh, int block_group,
+				struct ext3_group_desc *gdp)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+
+	J_ASSERT_BH(bh, buffer_locked(bh));
+
+	/* If checksum is bad mark all blocks and inodes use to prevent
+	 * allocation, essentially implementing a per-group read-only flag. */
+	if (!ext3_group_desc_csum_verify(sbi, block_group, gdp)) {
+		ext3_error(sb, __FUNCTION__, "Checksum bad for group %u\n",
+			   block_group);
+		gdp->bg_free_blocks_count = 0;
+		gdp->bg_free_inodes_count = 0;
+		gdp->bg_itable_unused = 0;
+		memset(bh->b_data, 0xff, sb->s_blocksize);
+		return 0;
+	}
+
+	memset(bh->b_data, 0, (EXT3_INODES_PER_GROUP(sb) + 7) / 8);
+	mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb),
+			bh->b_data);
+
+	return EXT3_INODES_PER_GROUP(sb);
+}
 
 /*
  * Read the inode allocation bitmap for a given block_group, reading
@@ -59,8 +106,19 @@ read_inode_bitmap(struct super_block * s
 	desc = ext3_get_group_desc(sb, block_group, NULL);
 	if (!desc)
 		goto error_out;
-
-	bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap));
+	if (desc->bg_flags & cpu_to_le16(EXT3_BG_INODE_UNINIT)) {
+		bh = sb_getblk(sb, le32_to_cpu(desc->bg_inode_bitmap));
+		if (!buffer_uptodate(bh)) {
+			lock_buffer(bh);
+			if (!buffer_uptodate(bh)) {
+				ext3_init_inode_bitmap(sb, bh,block_group,desc);
+				set_buffer_uptodate(bh);
+			}
+			unlock_buffer(bh);
+		}
+	} else {
+		bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap));
+	}
 	if (!bh)
 		ext3_error(sb, "read_inode_bitmap",
 			    "Cannot read inode bitmap - "
@@ -169,6 +227,8 @@ void ext3_free_inode (handle_t *handle, 
 			if (is_directory)
 				gdp->bg_used_dirs_count = cpu_to_le16(
 				  le16_to_cpu(gdp->bg_used_dirs_count) - 1);
+			gdp->bg_checksum = ext3_group_desc_csum(sbi,block_group,
+								gdp);
 			spin_unlock(sb_bgl_lock(sbi, block_group));
 			percpu_counter_inc(&sbi->s_freeinodes_counter);
 			if (is_directory)
@@ -453,7 +513,7 @@ struct inode *ext3_new_inode(handle_t *h
 	struct ext3_sb_info *sbi;
 	int err = 0;
 	struct inode *ret;
-	int i;
+	int i, free = 0;
 
 	/* Cannot create files in a deleted directory */
 	if (!dir || !dir->i_nlink)
@@ -570,11 +630,13 @@ repeat_in_this_group:
 	goto out;
 
 got:
-	ino += group * EXT3_INODES_PER_GROUP(sb) + 1;
-	if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
-		ext3_error (sb, "ext3_new_inode",
-			    "reserved inode or inode > inodes count - "
-			    "block_group = %d, inode=%lu", group, ino);
+	ino++;
+	if ((group == 0 && ino < EXT3_FIRST_INO(sb)) ||
+	    ino > EXT3_INODES_PER_GROUP(sb)) {
+		ext3_error(sb, __FUNCTION__,
+			   "reserved inode or inode > inodes count - "
+			   "block_group = %d, inode=%lu", group,
+			   ino + group * EXT3_INODES_PER_GROUP(sb));
 		err = -EIO;
 		goto fail;
 	}
@@ -582,13 +644,65 @@ got:
 	BUFFER_TRACE(bh2, "get_write_access");
 	err = ext3_journal_get_write_access(handle, bh2);
 	if (err) goto fail;
+
+	/* We may have to initialize the block bitmap if it isn't already */
+	if (EXT3_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
+	    gdp->bg_flags & cpu_to_le16(EXT3_BG_BLOCK_UNINIT)) {
+		struct buffer_head *block_bh = read_block_bitmap(sb, group);
+
+		BUFFER_TRACE(block_bh, "get block bitmap access");
+		err = ext3_journal_get_write_access(handle, block_bh);
+		if (err) {
+			brelse(block_bh);
+			goto fail;
+		}
+
+		free = 0;
+		spin_lock(sb_bgl_lock(sbi, group));
+		/* recheck and clear flag under lock if we still need to */
+		if (gdp->bg_flags & cpu_to_le16(EXT3_BG_BLOCK_UNINIT)) {
+			gdp->bg_flags &= cpu_to_le16(~EXT3_BG_BLOCK_UNINIT);
+			free = ext3_free_blocks_after_init(sb, group, gdp);
+			gdp->bg_free_blocks_count = cpu_to_le16(free);
+		}
+		spin_unlock(sb_bgl_lock(sbi, group));
+
+		/* Don't need to dirty bitmap block if we didn't change it */
+		if (free) {
+			BUFFER_TRACE(block_bh, "dirty block bitmap");
+			err = ext3_journal_dirty_metadata(handle, block_bh);
+		}
+
+		brelse(block_bh);
+		if (err)
+			goto fail;
+	}
+
 	spin_lock(sb_bgl_lock(sbi, group));
+	/* If we didn't allocate from within the initialized part of the inode
+	 * table then we need to initialize up to this inode. */
+	if (EXT3_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+		if (gdp->bg_flags & cpu_to_le16(EXT3_BG_INODE_UNINIT)) {
+			gdp->bg_flags &= cpu_to_le16(~EXT3_BG_INODE_UNINIT);
+			free = EXT3_INODES_PER_GROUP(sb);
+		} else {
+			free = EXT3_INODES_PER_GROUP(sb) -
+				le16_to_cpu(gdp->bg_itable_unused);
+		}
+
+		if (ino > free) {
+			gdp->bg_itable_unused =
+				cpu_to_le16(EXT3_INODES_PER_GROUP(sb) - ino);
+		}
+	}
+
 	gdp->bg_free_inodes_count =
 		cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
 	if (S_ISDIR(mode)) {
 		gdp->bg_used_dirs_count =
 			cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
 	}
+	gdp->bg_checksum = ext3_group_desc_csum(sbi, group, gdp);
 	spin_unlock(sb_bgl_lock(sbi, group));
 	BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
 	err = ext3_journal_dirty_metadata(handle, bh2);
@@ -610,7 +724,7 @@ got:
 		inode->i_gid = current->fsgid;
 	inode->i_mode = mode;
 
-	inode->i_ino = ino;
+	inode->i_ino = ino + group * EXT3_INODES_PER_GROUP(sb);
 	/* This is the optimal IO size (for stat), not the fs block size */
 	inode->i_blksize = PAGE_SIZE;
 	inode->i_blocks = 0;
Index: linux-2.6.16.27-0.9-full/fs/ext3/mballoc.c
===================================================================
--- linux-2.6.16.27-0.9-full.orig/fs/ext3/mballoc.c	2007-03-28 16:03:19.000000000 +0400
+++ linux-2.6.16.27-0.9-full/fs/ext3/mballoc.c	2007-03-28 18:30:36.000000000 +0400
@@ -36,6 +36,8 @@
 #include <linux/seq_file.h>
 #include <linux/version.h>
 
+#include "group.h"
+
 /*
  * MUSTDO:
  *   - test ext3_ext_search_left() and ext3_ext_search_right()
@@ -323,6 +325,7 @@ struct ext3_group_info {
 	unsigned long	bb_state;
 	unsigned long 	bb_tid;
 	struct ext3_free_metadata *bb_md_cur;
+	struct ext3_group_desc *bb_gdp;
 	unsigned short	bb_first_free;
 	unsigned short	bb_free;
 	unsigned short	bb_fragments;
@@ -928,10 +931,7 @@ static int ext3_mb_init_cache(struct pag
 		if (first_group + i >= EXT3_SB(sb)->s_groups_count)
 			break;
 
-		err = -EIO;
-		desc = ext3_get_group_desc(sb, first_group + i, NULL);
-		if (desc == NULL)
-			goto out;
+		desc = EXT3_GROUP_INFO(sb, first_group + i)->bb_gdp;
 
 		err = -ENOMEM;
 		bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap));
@@ -946,7 +946,12 @@ static int ext3_mb_init_cache(struct pag
 			unlock_buffer(bh[i]);
 			continue;
 		}
-
+		if (desc->bg_flags & cpu_to_le16(EXT3_BG_BLOCK_UNINIT)) {
+			ext3_init_block_bitmap(sb, bh[i], first_group + i,desc);
+			set_buffer_uptodate(bh[i]);
+			unlock_buffer(bh[i]);
+			continue;
+		}
 		get_bh(bh[i]);
 		bh[i]->b_end_io = end_buffer_read_sync;
 		submit_bh(READ, bh[i]);
@@ -1703,6 +1708,10 @@ static int ext3_mb_good_group(struct ext
 	switch (cr) {
 		case 0:
 			BUG_ON(ac->ac_2order == 0);
+			/* If this group is uninitialized, skip it initially */
+			if (grp->bb_gdp->bg_flags &
+			    cpu_to_le16(EXT3_BG_BLOCK_UNINIT))
+				return 0;
 			bits = ac->ac_sb->s_blocksize_bits + 1;
 			for (i = ac->ac_2order; i <= bits; i++)
 				if (grp->bb_counters[i] > 0)
@@ -1796,7 +1805,9 @@ repeat:
 			}
 
 			ac->ac_groups_scanned++;
-			if (cr == 0)
+			if (cr == 0 || (e3b.bd_info->bb_gdp->bg_flags &
+					cpu_to_le16(EXT3_BG_BLOCK_UNINIT) &&
+					ac->ac_2order != 0))
 				ext3_mb_simple_scan_group(ac, &e3b);
 			else if (cr == 1 && ac->ac_g_ex.fe_len == sbi->s_stripe)
 				ext3_mb_scan_aligned(ac, &e3b);
@@ -2267,12 +2278,13 @@ int ext3_mb_init_backend(struct super_bl
 			i--;
 			goto err_freebuddy;
 		}
+		memset(meta_group_info[j], 0, len);
 		desc = ext3_get_group_desc(sb, i, NULL);
+		meta_group_info[j]->bb_gdp = desc;
 		if (desc == NULL) {
 			printk(KERN_ERR"EXT3-fs: can't read descriptor %u\n",i);
 			goto err_freebuddy;
 		}
-		memset(meta_group_info[j], 0, len);
 		set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT,
 			&meta_group_info[j]->bb_state);
 
@@ -2936,9 +2948,17 @@ int ext3_mb_mark_diskspace_used(struct e
 	mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
 
 	spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
+	if (gdp->bg_flags & cpu_to_le16(EXT3_BG_BLOCK_UNINIT)) {
+		gdp->bg_flags &= cpu_to_le16(~EXT3_BG_BLOCK_UNINIT);
+		gdp->bg_free_blocks_count =
+			cpu_to_le16(ext3_free_blocks_after_init(sb,
+							    ac->ac_b_ex.fe_group,
+							    gdp));
+	}
 	gdp->bg_free_blocks_count =
 		cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)
 				- ac->ac_b_ex.fe_len);
+	gdp->bg_checksum = ext3_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
 	spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
 	percpu_counter_mod(&sbi->s_freeblocks_counter, - ac->ac_b_ex.fe_len);
 
@@ -4303,6 +4323,7 @@ do_more:
 	spin_lock(sb_bgl_lock(sbi, block_group));
 	gdp->bg_free_blocks_count =
 		cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
+	gdp->bg_checksum = ext3_group_desc_csum(sbi, block_group, gdp);
 	spin_unlock(sb_bgl_lock(sbi, block_group));
 	percpu_counter_mod(&sbi->s_freeblocks_counter, count);
 
Index: linux-2.6.16.27-0.9-full/fs/ext3/balloc.c
===================================================================
--- linux-2.6.16.27-0.9-full.orig/fs/ext3/balloc.c	2007-03-28 16:03:20.000000000 +0400
+++ linux-2.6.16.27-0.9-full/fs/ext3/balloc.c	2007-03-28 18:30:06.000000000 +0400
@@ -21,6 +21,7 @@
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 
+#include "group.h"
 /*
  * balloc.c contains the blocks allocation and deallocation routines
  */
@@ -74,6 +75,75 @@ struct ext3_group_desc * ext3_get_group_
 	return desc + offset;
 }
 
+/* Initializes an uninitialized block bitmap if given, and returns the
+ * number of blocks free in the group. */
+unsigned ext3_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
+				int block_group, struct ext3_group_desc *gdp)
+{
+	unsigned long start;
+	int bit, bit_max;
+	unsigned free_blocks;
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+
+	if (bh) {
+		J_ASSERT_BH(bh, buffer_locked(bh));
+
+		/* If checksum is bad mark all blocks use to prevent allocation,
+		 * essentially implementing a per-group read-only flag. */
+		if (!ext3_group_desc_csum_verify(sbi, block_group, gdp)) {
+			ext3_error(sb, __FUNCTION__,
+				   "Checksum bad for group %u\n", block_group);
+			gdp->bg_free_blocks_count = 0;
+			gdp->bg_free_inodes_count = 0;
+			gdp->bg_itable_unused = 0;
+			memset(bh->b_data, 0xff, sb->s_blocksize);
+			return 0;
+		}
+		memset(bh->b_data, 0, sb->s_blocksize);
+	}
+
+	/* Check for superblock and gdt backups in this group */
+	bit_max = ext3_bg_has_super(sb, block_group);
+
+	if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) ||
+	    block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
+			  sbi->s_desc_per_block) {
+		if (bit_max) {
+			bit_max += ext3_bg_num_gdb(sb, block_group);
+			bit_max +=le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+		}
+	} else { /* For META_BG_BLOCK_GROUPS */
+		int group_rel = (block_group -
+				 le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
+				EXT3_DESC_PER_BLOCK(sb);
+		if (group_rel == 0 || group_rel == 1 ||
+		    (group_rel == EXT3_DESC_PER_BLOCK(sb) - 1))
+			bit_max += 1;
+	}
+
+	/* Last and first groups are always initialized */
+	free_blocks = EXT3_BLOCKS_PER_GROUP(sb) - bit_max;
+
+	if (bh) {
+		for (bit = 0; bit < bit_max; bit++)
+			ext3_set_bit(bit, bh->b_data);
+
+		start = block_group * EXT3_BLOCKS_PER_GROUP(sb) +
+			le32_to_cpu(sbi->s_es->s_first_data_block);
+
+		/* Set bits for block and inode bitmaps, and inode table */
+		ext3_set_bit(le32_to_cpu(gdp->bg_block_bitmap) - start,
+			     bh->b_data);
+		ext3_set_bit(le32_to_cpu(gdp->bg_inode_bitmap) - start,
+			     bh->b_data);
+		for (bit = le32_to_cpu(gdp->bg_inode_table) - start,
+		     bit_max = bit + sbi->s_itb_per_group; bit < bit_max; bit++)
+			ext3_set_bit(bit, bh->b_data);
+	}
+
+	return free_blocks - sbi->s_itb_per_group - 2;
+}
+
 /*
  * Read the bitmap for a given block_group, reading into the specified 
  * slot in the superblock's bitmap cache.
@@ -89,7 +159,19 @@ read_block_bitmap(struct super_block *sb
 	desc = ext3_get_group_desc (sb, block_group, NULL);
 	if (!desc)
 		goto error_out;
-	bh = sb_bread(sb, le32_to_cpu(desc->bg_block_bitmap));
+	if (desc->bg_flags & cpu_to_le16(EXT3_BG_BLOCK_UNINIT)) {
+		bh = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap));
+		if (!buffer_uptodate(bh)) {
+			lock_buffer(bh);
+			if (!buffer_uptodate(bh)) {
+				ext3_init_block_bitmap(sb, bh,block_group,desc);
+				set_buffer_uptodate(bh);
+			}
+			unlock_buffer(bh);
+		}
+	} else {
+		bh = sb_bread(sb, le32_to_cpu(desc->bg_block_bitmap));
+	}
 	if (!bh)
 		ext3_error (sb, "read_block_bitmap",
 			    "Cannot read block bitmap - "
@@ -468,6 +550,7 @@ do_more:
 	desc->bg_free_blocks_count =
 		cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
 			group_freed);
+	desc->bg_checksum = ext3_group_desc_csum(sbi, block_group, desc);
 	spin_unlock(sb_bgl_lock(sbi, block_group));
 	percpu_counter_mod(&sbi->s_freeblocks_counter, count);
 
@@ -1378,8 +1461,11 @@ allocated:
 			ret_block, goal_hits, goal_attempts);
 
 	spin_lock(sb_bgl_lock(sbi, group_no));
+	if (gdp->bg_flags & cpu_to_le16(EXT3_BG_BLOCK_UNINIT))
+		gdp->bg_flags &= cpu_to_le16(~EXT3_BG_BLOCK_UNINIT);
 	gdp->bg_free_blocks_count =
 			cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1);
+	gdp->bg_checksum = ext3_group_desc_csum(sbi, group_no, gdp);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
 	percpu_counter_mod(&sbi->s_freeblocks_counter, -1);
 
%diffstat
 fs/ext3/balloc.c        |   88 +++++++++++++++++++++++++++++
 fs/ext3/group.h         |   38 ++++++++++++
 fs/ext3/ialloc.c        |  144 +++++++++++++++++++++++++++++++++++++++++++-----
 fs/ext3/mballoc.c       |   35 +++++++++--
 fs/ext3/resize.c        |    2 
 fs/ext3/super.c         |   92 ++++++++++++++++++++++++++++++
 include/linux/ext3_fs.h |   16 ++++-
 7 files changed, 388 insertions(+), 27 deletions(-)
====================== ext3-uninit-2.6.16-sles10.patch =================

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

-
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ